From fa84ae26d62c7ac2ad6dca18b2d8b12ab83bc900 Mon Sep 17 00:00:00 2001
From: Aymeric Augustin <aymeric.augustin@fractalideas.com>
Date: Sat, 21 Dec 2019 15:46:46 +0100
Subject: [PATCH] Reformat source code with black.

This is the result of:

    $ black --line-length 119 examples templates transformers utils hubconf.py setup.py

There's a lot of fairly long lines in the project. As a consequence, I'm
picking the longest widely accepted line length, 119 characters.

This is also Thomas' preference, because it allows for explicit variable
names, to make the code easier to understand.
---
 examples/benchmarks.py                        | 169 +--
 examples/contrib/run_camembert.py             |  37 +-
 examples/contrib/run_openai_gpt.py            | 163 +--
 examples/contrib/run_swag.py                  | 450 ++++----
 examples/contrib/run_transfo_xl.py            |  92 +-
 examples/distillation/distiller.py            | 344 +++---
 .../distillation/grouped_batch_sampler.py     |  13 +-
 examples/distillation/lm_seqs_dataset.py      |  32 +-
 .../distillation/run_squad_w_distillation.py  | 656 +++++++-----
 .../distillation/scripts/binarized_data.py    |  67 +-
 examples/distillation/scripts/extract.py      |  94 +-
 .../scripts/extract_distilbert.py             |  85 +-
 examples/distillation/scripts/token_counts.py |  34 +-
 examples/distillation/train.py                | 300 +++---
 examples/distillation/utils.py                |  32 +-
 examples/mm-imdb/run_mmimdb.py                | 413 +++++---
 examples/mm-imdb/utils_mmimdb.py              |  58 +-
 examples/pplm/pplm_classification_head.py     |   1 +
 examples/pplm/run_pplm.py                     | 466 ++++-----
 examples/pplm/run_pplm_discrim_train.py       | 293 ++----
 examples/run_bertology.py                     | 246 +++--
 examples/run_generation.py                    |  64 +-
 examples/run_glue.py                          | 475 +++++----
 examples/run_lm_finetuning.py                 | 440 +++++---
 examples/run_multiple_choice.py               | 454 +++++---
 examples/run_ner.py                           | 376 ++++---
 examples/run_squad.py                         | 678 +++++++-----
 examples/run_tf_glue.py                       |  66 +-
 examples/run_tf_ner.py                        | 510 ++++-----
 examples/run_xnli.py                          | 440 +++++---
 ...ert_bertabs_original_pytorch_checkpoint.py |  45 +-
 examples/summarization/modeling_bertabs.py    | 218 +---
 examples/summarization/run_summarization.py   |  67 +-
 examples/summarization/utils_summarization.py |  12 +-
 .../summarization/utils_summarization_test.py |  24 +-
 examples/test_examples.py                     |  88 +-
 examples/utils_multiple_choice.py             | 135 +--
 examples/utils_ner.py                         |  60 +-
 hubconf.py                                    |  11 +-
 setup.py                                      |  47 +-
 .../adding_a_new_example_script/run_xxx.py    | 573 ++++++----
 .../adding_a_new_example_script/utils_xxx.py  | 287 ++---
 .../adding_a_new_model/configuration_xxx.py   |  42 +-
 ...t_xxx_original_tf_checkpoint_to_pytorch.py |  36 +-
 .../adding_a_new_model/modeling_tf_xxx.py     | 100 +-
 templates/adding_a_new_model/modeling_xxx.py  | 208 ++--
 .../tests/modeling_tf_xxx_test.py             | 173 +--
 .../tests/modeling_xxx_test.py                | 166 +--
 .../tests/tokenization_xxx_test.py            |  30 +-
 .../adding_a_new_model/tokenization_xxx.py    |  65 +-
 transformers/__init__.py                      | 415 +++++---
 transformers/__main__.py                      |  13 +-
 transformers/commands/__init__.py             |   1 +
 transformers/commands/convert.py              | 121 ++-
 transformers/commands/download.py             |  15 +-
 transformers/commands/run.py                  |  77 +-
 transformers/commands/serving.py              |  81 +-
 transformers/commands/train.py                | 138 +--
 transformers/commands/user.py                 |  72 +-
 transformers/configuration_albert.py          |  52 +-
 transformers/configuration_auto.py            |  81 +-
 transformers/configuration_bert.py            |  72 +-
 transformers/configuration_camembert.py       |   5 +-
 transformers/configuration_ctrl.py            |   4 +-
 transformers/configuration_distilbert.py      |  45 +-
 transformers/configuration_gpt2.py            |  16 +-
 transformers/configuration_mmbt.py            |   4 +-
 transformers/configuration_openai.py          |   4 +-
 transformers/configuration_roberta.py         |  15 +-
 transformers/configuration_t5.py              |  38 +-
 transformers/configuration_transfo_xl.py      |  66 +-
 transformers/configuration_utils.py           | 107 +-
 transformers/configuration_xlm.py             |  89 +-
 transformers/configuration_xlm_roberta.py     |  15 +-
 transformers/configuration_xlnet.py           |  55 +-
 ...lbert_original_tf_checkpoint_to_pytorch.py |  36 +-
 ..._bert_original_tf_checkpoint_to_pytorch.py |  36 +-
 ..._bert_pytorch_checkpoint_to_original_tf.py |  66 +-
 ..._gpt2_original_tf_checkpoint_to_pytorch.py |  42 +-
 ...penai_original_tf_checkpoint_to_pytorch.py |  48 +-
 .../convert_pytorch_checkpoint_to_tf2.py      | 463 +++++---
 ..._original_pytorch_checkpoint_to_pytorch.py | 103 +-
 ...rt_t5_original_tf_checkpoint_to_pytorch.py |  36 +-
 ...fo_xl_original_tf_checkpoint_to_pytorch.py |  78 +-
 ..._original_pytorch_checkpoint_to_pytorch.py |  38 +-
 ...xlnet_original_tf_checkpoint_to_pytorch.py |  73 +-
 transformers/data/__init__.py                 |   9 +-
 transformers/data/metrics/__init__.py         |   7 +-
 transformers/data/metrics/squad_metrics.py    | 165 ++-
 transformers/data/processors/__init__.py      |   2 +-
 transformers/data/processors/glue.py          | 256 +++--
 transformers/data/processors/squad.py         | 107 +-
 transformers/data/processors/utils.py         | 110 +-
 transformers/data/processors/xnli.py          |  14 +-
 transformers/file_utils.py                    | 140 ++-
 transformers/hf_api.py                        |  46 +-
 transformers/modelcard.py                     |  80 +-
 transformers/modeling_albert.py               | 206 ++--
 transformers/modeling_auto.py                 | 304 ++++--
 transformers/modeling_bert.py                 | 460 +++++---
 transformers/modeling_camembert.py            |  49 +-
 transformers/modeling_ctrl.py                 | 121 ++-
 transformers/modeling_distilbert.py           | 238 +++--
 transformers/modeling_encoder_decoder.py      |  25 +-
 transformers/modeling_gpt2.py                 | 173 +--
 transformers/modeling_mmbt.py                 | 148 ++-
 transformers/modeling_openai.py               | 147 ++-
 transformers/modeling_roberta.py              | 203 ++--
 transformers/modeling_t5.py                   | 260 +++--
 transformers/modeling_tf_albert.py            | 270 ++---
 transformers/modeling_tf_auto.py              | 264 +++--
 transformers/modeling_tf_bert.py              | 367 ++++---
 transformers/modeling_tf_ctrl.py              | 137 ++-
 transformers/modeling_tf_distilbert.py        | 244 +++--
 transformers/modeling_tf_gpt2.py              | 174 ++--
 transformers/modeling_tf_openai.py            | 167 +--
 transformers/modeling_tf_pytorch_utils.py     | 107 +-
 transformers/modeling_tf_roberta.py           | 102 +-
 transformers/modeling_tf_t5.py                | 294 +++---
 transformers/modeling_tf_transfo_xl.py        | 305 +++---
 .../modeling_tf_transfo_xl_utilities.py       |  87 +-
 transformers/modeling_tf_utils.py             | 179 ++--
 transformers/modeling_tf_xlm.py               | 236 +++--
 transformers/modeling_tf_xlnet.py             | 330 +++---
 transformers/modeling_transfo_xl.py           | 314 +++---
 transformers/modeling_transfo_xl_utilities.py |  77 +-
 transformers/modeling_utils.py                | 421 +++++---
 transformers/modeling_xlm.py                  | 304 ++++--
 transformers/modeling_xlm_roberta.py          |  59 +-
 transformers/modeling_xlnet.py                | 600 +++++++----
 transformers/optimization.py                  |  60 +-
 transformers/optimization_tf.py               | 281 +++--
 transformers/pipelines.py                     | 466 +++++----
 .../tests/configuration_common_test.py        |  11 +-
 transformers/tests/hf_api_test.py             |  28 +-
 transformers/tests/model_card_test.py         |  64 +-
 transformers/tests/modeling_albert_test.py    | 143 +--
 transformers/tests/modeling_auto_test.py      |  23 +-
 transformers/tests/modeling_bert_test.py      | 310 ++++--
 transformers/tests/modeling_common_test.py    | 271 ++---
 transformers/tests/modeling_ctrl_test.py      | 110 +-
 .../tests/modeling_distilbert_test.py         | 128 ++-
 .../tests/modeling_encoder_decoder_test.py    |   6 +-
 transformers/tests/modeling_gpt2_test.py      | 150 +--
 transformers/tests/modeling_openai_test.py    | 117 +--
 transformers/tests/modeling_roberta_test.py   | 185 ++--
 transformers/tests/modeling_t5_test.py        | 137 ++-
 transformers/tests/modeling_tf_albert_test.py | 153 ++-
 transformers/tests/modeling_tf_auto_test.py   |  28 +-
 transformers/tests/modeling_tf_bert_test.py   | 233 +++--
 transformers/tests/modeling_tf_common_test.py |  90 +-
 transformers/tests/modeling_tf_ctrl_test.py   | 103 +-
 .../tests/modeling_tf_distilbert_test.py      | 128 +--
 transformers/tests/modeling_tf_gpt2_test.py   | 140 +--
 .../tests/modeling_tf_openai_gpt_test.py      | 142 +--
 .../tests/modeling_tf_roberta_test.py         | 166 ++-
 transformers/tests/modeling_tf_t5_test.py     |  93 +-
 .../tests/modeling_tf_transfo_xl_test.py      |  97 +-
 transformers/tests/modeling_tf_xlm_test.py    | 239 +++--
 transformers/tests/modeling_tf_xlnet_test.py  | 269 +++--
 .../tests/modeling_transfo_xl_test.py         |  89 +-
 transformers/tests/modeling_xlm_test.py       | 299 ++++--
 transformers/tests/modeling_xlnet_test.py     | 347 ++++--
 transformers/tests/optimization_test.py       |  33 +-
 transformers/tests/optimization_tf_test.py    |  10 +-
 transformers/tests/pipelines_test.py          | 122 +--
 .../tests/tokenization_albert_test.py         |  35 +-
 transformers/tests/tokenization_auto_test.py  |   1 +
 .../tests/tokenization_bert_japanese_test.py  | 129 +--
 transformers/tests/tokenization_bert_test.py  | 104 +-
 transformers/tests/tokenization_ctrl_test.py  |  20 +-
 .../tests/tokenization_distilbert_test.py     |  10 +-
 transformers/tests/tokenization_gpt2_test.py  |  41 +-
 .../tests/tokenization_openai_test.py         |  41 +-
 .../tests/tokenization_roberta_test.py        |  52 +-
 transformers/tests/tokenization_t5_test.py    |  88 +-
 .../tests/tokenization_tests_commons.py       | 177 ++--
 .../tests/tokenization_transfo_xl_test.py     |  38 +-
 transformers/tests/tokenization_utils_test.py |   3 +-
 transformers/tests/tokenization_xlm_test.py   |  42 +-
 transformers/tests/tokenization_xlnet_test.py | 154 ++-
 transformers/tests/utils.py                   |   1 +
 transformers/tokenization_albert.py           | 118 ++-
 transformers/tokenization_auto.py             |  46 +-
 transformers/tokenization_bert.py             | 190 ++--
 transformers/tokenization_bert_japanese.py    | 142 +--
 transformers/tokenization_camembert.py        |  60 +-
 transformers/tokenization_ctrl.py             |  68 +-
 transformers/tokenization_distilbert.py       |  21 +-
 transformers/tokenization_gpt2.py             | 120 ++-
 transformers/tokenization_openai.py           |  92 +-
 transformers/tokenization_roberta.py          |  90 +-
 transformers/tokenization_t5.py               |  79 +-
 transformers/tokenization_transfo_xl.py       | 265 ++---
 transformers/tokenization_utils.py            | 444 ++++----
 transformers/tokenization_xlm.py              | 985 +++++++++---------
 transformers/tokenization_xlm_roberta.py      |  75 +-
 transformers/tokenization_xlnet.py            | 102 +-
 utils/download_glue_data.py                   |  77 +-
 utils/link_tester.py                          |   2 +-
 200 files changed, 17452 insertions(+), 12594 deletions(-)

diff --git a/examples/benchmarks.py b/examples/benchmarks.py
index 26c260b9ec..20b62112b4 100644
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -247,16 +247,18 @@ the wall, slowly on into the Social Predestination Room.
 as they entered."""
 
 
-def create_setup_and_compute(model_names: List[str],
-                             gpu: bool = True,
-                             tensorflow: bool = False,
-                             average_over: int = 3,
-                             torchscript: bool = False,
-                             xla: bool = False,
-                             amp: bool = False,
-                             fp16: bool = False,
-                             save_to_csv: bool = False,
-                             csv_filename: str = f"results_{round(time())}.csv"):
+def create_setup_and_compute(
+    model_names: List[str],
+    gpu: bool = True,
+    tensorflow: bool = False,
+    average_over: int = 3,
+    torchscript: bool = False,
+    xla: bool = False,
+    amp: bool = False,
+    fp16: bool = False,
+    save_to_csv: bool = False,
+    csv_filename: str = f"results_{round(time())}.csv",
+):
     if xla:
         tf.config.optimizer.set_jit(True)
     if amp:
@@ -266,7 +268,7 @@ def create_setup_and_compute(model_names: List[str],
         dictionary = {model_name: {} for model_name in model_names}
         results = _compute_tensorflow(model_names, dictionary, average_over, amp)
     else:
-        device = 'cuda' if (gpu and torch.cuda.is_available()) else 'cpu'
+        device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu"
         dictionary = {model_name: {} for model_name in model_names}
         results = _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16)
 
@@ -276,34 +278,52 @@ def create_setup_and_compute(model_names: List[str],
         for batch_size in results[model_name]["bs"]:
             print("\t\t" + f"===== BATCH SIZE: {batch_size} =====")
             for slice_size in results[model_name]["ss"]:
-                result = results[model_name]['results'][batch_size][slice_size]
+                result = results[model_name]["results"][batch_size][slice_size]
                 if isinstance(result, str):
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: "
-                          f"{result}")
+                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result}")
                 else:
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: "
-                          f"{(round(1000 * result) / 1000)}"
-                          f"s")
+                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{(round(1000 * result) / 1000)}" f"s")
 
     if save_to_csv:
-        with open(csv_filename, mode='w') as csv_file:
-            fieldnames = ['model',
-                          '1x8', '1x64', '1x128', '1x256', '1x512', '1x1024',
-                          '2x8', '2x64', '2x128', '2x256', '2x512', '2x1024',
-                          '4x8', '4x64', '4x128', '4x256', '4x512', '4x1024',
-                          '8x8', '8x64', '8x128', '8x256', '8x512', '8x1024',
-                          ]
+        with open(csv_filename, mode="w") as csv_file:
+            fieldnames = [
+                "model",
+                "1x8",
+                "1x64",
+                "1x128",
+                "1x256",
+                "1x512",
+                "1x1024",
+                "2x8",
+                "2x64",
+                "2x128",
+                "2x256",
+                "2x512",
+                "2x1024",
+                "4x8",
+                "4x64",
+                "4x128",
+                "4x256",
+                "4x512",
+                "4x1024",
+                "8x8",
+                "8x64",
+                "8x128",
+                "8x256",
+                "8x512",
+                "8x1024",
+            ]
 
             writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
             writer.writeheader()
 
             for model_name in model_names:
                 model_results = {
-                    f'{bs}x{ss}': results[model_name]['results'][bs][ss]
+                    f"{bs}x{ss}": results[model_name]["results"][bs][ss]
                     for bs in results[model_name]["results"]
-                    for ss in results[model_name]['results'][bs]
+                    for ss in results[model_name]["results"][bs]
                 }
-                writer.writerow({'model': model_name, **model_results})
+                writer.writerow({"model": model_name, **model_results})
 
 
 def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
@@ -343,7 +363,7 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript,
 
                         print("Going through model with sequence of shape", sequence.shape)
                         runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                        average_time = sum(runtimes)/float(len(runtimes)) / 3.0
+                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                         dictionary[model_name]["results"][batch_size][slice_size] = average_time
                     except RuntimeError as e:
                         print("Doesn't fit on GPU.", e)
@@ -379,7 +399,9 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp):
                 if max_input_size is not None and slice_size > max_input_size:
                     dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                 else:
-                    sequence = tf.stack([tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size)
+                    sequence = tf.stack(
+                        [tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size
+                    )
 
                     try:
                         print("Going through model with sequence of shape", sequence.shape)
@@ -387,7 +409,7 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp):
                         inference(sequence)
 
                         runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                        average_time = sum(runtimes)/float(len(runtimes)) / 3.0
+                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                         dictionary[model_name]["results"][batch_size][slice_size] = average_time
                     except tf.errors.ResourceExhaustedError as e:
                         print("Doesn't fit on GPU.", e)
@@ -399,33 +421,64 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp):
 def main():
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("--models", required=False, type=str, default='all', help="Model checkpoints to be provided "
-                                                                                  "to the AutoModel classes. Leave "
-                                                                                  "blank to benchmark the base version "
-                                                                                  "of all available model "
-                                                                                  "architectures.")
-    parser.add_argument("--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the "
-                                                                             "models")
-    parser.add_argument("--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available "
-                                                                                  "cuda devices")
-    parser.add_argument("--torchscript", required=False, action="store_true", help="Pytorch only: trace the models "
-                                                                                   "using torchscript")
-    parser.add_argument("--tensorflow", required=False, action="store_true", help="Benchmark the TensorFlow version "
-                                                                                  "of the models. Will run on GPU if "
-                                                                                  "the correct dependencies are "
-                                                                                  "installed")
+    parser.add_argument(
+        "--models",
+        required=False,
+        type=str,
+        default="all",
+        help="Model checkpoints to be provided "
+        "to the AutoModel classes. Leave "
+        "blank to benchmark the base version "
+        "of all available model "
+        "architectures.",
+    )
+    parser.add_argument(
+        "--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " "models"
+    )
+    parser.add_argument(
+        "--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available " "cuda devices"
+    )
+    parser.add_argument(
+        "--torchscript",
+        required=False,
+        action="store_true",
+        help="Pytorch only: trace the models " "using torchscript",
+    )
+    parser.add_argument(
+        "--tensorflow",
+        required=False,
+        action="store_true",
+        help="Benchmark the TensorFlow version "
+        "of the models. Will run on GPU if "
+        "the correct dependencies are "
+        "installed",
+    )
     parser.add_argument("--xla", required=False, action="store_true", help="TensorFlow only: use XLA acceleration.")
-    parser.add_argument("--amp", required=False, action="store_true", help="TensorFlow only: use automatic mixed precision acceleration.")
-    parser.add_argument("--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference.")
-    parser.add_argument("--keras_predict", required=False, action="store_true", help="Whether to use model.predict "
-                                                                                     "instead of model() to do a "
-                                                                                     "forward pass.")
+    parser.add_argument(
+        "--amp",
+        required=False,
+        action="store_true",
+        help="TensorFlow only: use automatic mixed precision acceleration.",
+    )
+    parser.add_argument(
+        "--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference."
+    )
+    parser.add_argument(
+        "--keras_predict",
+        required=False,
+        action="store_true",
+        help="Whether to use model.predict " "instead of model() to do a " "forward pass.",
+    )
     parser.add_argument("--save_to_csv", required=False, action="store_true", help="Save to a CSV file.")
-    parser.add_argument("--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv.")
-    parser.add_argument("--average_over", required=False, default=30, type=int, help="Times an experiment will be run.")
+    parser.add_argument(
+        "--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv."
+    )
+    parser.add_argument(
+        "--average_over", required=False, default=30, type=int, help="Times an experiment will be run."
+    )
 
     args = parser.parse_args()
-    if args.models == 'all':
+    if args.models == "all":
         args.models = [
             "gpt2",
             "bert-base-cased",
@@ -436,7 +489,7 @@ def main():
             "distilbert-base-uncased",
             "distilgpt2",
             "roberta-base",
-            "ctrl"
+            "ctrl",
         ]
     else:
         args.models = args.models.split()
@@ -453,7 +506,7 @@ def main():
                 fp16=args.fp16,
                 save_to_csv=args.save_to_csv,
                 csv_filename=args.csv_filename,
-                average_over=args.average_over
+                average_over=args.average_over,
             )
         else:
             raise ImportError("Trying to run a PyTorch benchmark but PyTorch was not found in the environment.")
@@ -467,11 +520,11 @@ def main():
                 amp=args.amp,
                 save_to_csv=args.save_to_csv,
                 csv_filename=args.csv_filename,
-                average_over=args.average_over
+                average_over=args.average_over,
             )
         else:
             raise ImportError("Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.")
 
-if __name__ == '__main__':
-    main()
 
+if __name__ == "__main__":
+    main()
diff --git a/examples/contrib/run_camembert.py b/examples/contrib/run_camembert.py
index 28144d5167..99f54f5442 100644
--- a/examples/contrib/run_camembert.py
+++ b/examples/contrib/run_camembert.py
@@ -10,38 +10,37 @@ from transformers.modeling_camembert import CamembertForMaskedLM
 
 def fill_mask(masked_input, model, tokenizer, topk=5):
     # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
-    assert masked_input.count('<mask>') == 1
+    assert masked_input.count("<mask>") == 1
     input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
     logits = model(input_ids)[0]  # The last hidden-state is the first element of the output tuple
     masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
     logits = logits[0, masked_index, :]
     prob = logits.softmax(dim=0)
     values, indices = prob.topk(k=topk, dim=0)
-    topk_predicted_token_bpe = ' '.join([tokenizer.convert_ids_to_tokens(indices[i].item())
-                                         for i in range(len(indices))])
+    topk_predicted_token_bpe = " ".join(
+        [tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))]
+    )
     masked_token = tokenizer.mask_token
     topk_filled_outputs = []
-    for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(' ')):
-        predicted_token = predicted_token_bpe.replace('\u2581', ' ')
+    for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")):
+        predicted_token = predicted_token_bpe.replace("\u2581", " ")
         if " {0}".format(masked_token) in masked_input:
-            topk_filled_outputs.append((
-                masked_input.replace(
-                    ' {0}'.format(masked_token), predicted_token
-                ),
-                values[index].item(),
-                predicted_token,
-            ))
+            topk_filled_outputs.append(
+                (
+                    masked_input.replace(" {0}".format(masked_token), predicted_token),
+                    values[index].item(),
+                    predicted_token,
+                )
+            )
         else:
-            topk_filled_outputs.append((
-                masked_input.replace(masked_token, predicted_token),
-                values[index].item(),
-                predicted_token,
-            ))
+            topk_filled_outputs.append(
+                (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
+            )
     return topk_filled_outputs
 
 
-tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
-model = CamembertForMaskedLM.from_pretrained('camembert-base')
+tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
+model = CamembertForMaskedLM.from_pretrained("camembert-base")
 model.eval()
 
 masked_input = "Le camembert est <mask> :)"
diff --git a/examples/contrib/run_openai_gpt.py b/examples/contrib/run_openai_gpt.py
index bc5695becd..f6431c80be 100644
--- a/examples/contrib/run_openai_gpt.py
+++ b/examples/contrib/run_openai_gpt.py
@@ -36,34 +36,42 @@ from tqdm import tqdm, trange
 
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 
-from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
-                                     AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
-                                     get_linear_schedule_with_warmup)
+from transformers import (
+    OpenAIGPTDoubleHeadsModel,
+    OpenAIGPTTokenizer,
+    AdamW,
+    cached_path,
+    WEIGHTS_NAME,
+    CONFIG_NAME,
+    get_linear_schedule_with_warmup,
+)
 
 ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
 
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
 logger = logging.getLogger(__name__)
 
+
 def accuracy(out, labels):
     outputs = np.argmax(out, axis=1)
     return np.sum(outputs == labels)
 
+
 def load_rocstories_dataset(dataset_path):
     """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
-    with open(dataset_path, encoding='utf_8') as f:
+    with open(dataset_path, encoding="utf_8") as f:
         f = csv.reader(f)
         output = []
-        next(f) # skip the first line
+        next(f)  # skip the first line
         for line in tqdm(f):
-            output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))
+            output.append((" ".join(line[1:5]), line[5], line[6], int(line[-1]) - 1))
     return output
 
+
 def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
     """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
 
@@ -80,56 +88,68 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
         for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
             with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
             with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
-            input_ids[i, 0, :len(with_cont1)] = with_cont1
-            input_ids[i, 1, :len(with_cont2)] = with_cont2
+            input_ids[i, 0, : len(with_cont1)] = with_cont1
+            input_ids[i, 1, : len(with_cont2)] = with_cont2
             mc_token_ids[i, 0] = len(with_cont1) - 1
             mc_token_ids[i, 1] = len(with_cont2) - 1
-            lm_labels[i, 0, :len(with_cont1)] = with_cont1
-            lm_labels[i, 1, :len(with_cont2)] = with_cont2
+            lm_labels[i, 0, : len(with_cont1)] = with_cont1
+            lm_labels[i, 1, : len(with_cont2)] = with_cont2
             mc_labels[i] = mc_label
         all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
         tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
     return tensor_datasets
 
+
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name', type=str, default='openai-gpt',
-                        help='pretrained model name')
-    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
-    parser.add_argument('--train_dataset', type=str, default='')
-    parser.add_argument('--eval_dataset', type=str, default='')
-    parser.add_argument('--seed', type=int, default=42)
-    parser.add_argument('--num_train_epochs', type=int, default=3)
-    parser.add_argument('--train_batch_size', type=int, default=8)
-    parser.add_argument('--eval_batch_size', type=int, default=16)
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument('--max_grad_norm', type=int, default=1)
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training \
-                        steps to perform. Override num_train_epochs.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before\
-                        performing a backward/update pass.")
-    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
-    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
-    parser.add_argument('--weight_decay', type=float, default=0.01)
-    parser.add_argument('--lm_coef', type=float, default=0.9)
-    parser.add_argument('--n_valid', type=int, default=374)
+    parser.add_argument("--model_name", type=str, default="openai-gpt", help="pretrained model name")
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--train_dataset", type=str, default="")
+    parser.add_argument("--eval_dataset", type=str, default="")
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--num_train_epochs", type=int, default=3)
+    parser.add_argument("--train_batch_size", type=int, default=8)
+    parser.add_argument("--eval_batch_size", type=int, default=16)
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", type=int, default=1)
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training \
+                        steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before\
+                        performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", type=float, default=6.25e-5)
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+    parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
+    parser.add_argument("--weight_decay", type=float, default=0.01)
+    parser.add_argument("--lm_coef", type=float, default=0.9)
+    parser.add_argument("--n_valid", type=int, default=374)
 
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
     args = parser.parse_args()
     print(args)
 
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
+
         print("Waiting for debugger attach")
         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()
@@ -152,7 +172,7 @@ def main():
     # Load tokenizer and model
     # This loading functions also add new tokens and embeddings called `special tokens`
     # These new embeddings will be fine-tuned on the RocStories dataset
-    special_tokens = ['_start_', '_delimiter_', '_classify_']
+    special_tokens = ["_start_", "_delimiter_", "_classify_"]
     tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
     tokenizer.add_tokens(special_tokens)
     special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
@@ -163,6 +183,7 @@ def main():
     # Load and encode the datasets
     if not args.train_dataset and not args.eval_dataset:
         roc_stories = cached_path(ROCSTORIES_URL)
+
     def tokenize_and_encode(obj):
         """ Tokenize and encode a nested object """
         if isinstance(obj, str):
@@ -170,6 +191,7 @@ def main():
         elif isinstance(obj, int):
             return obj
         return list(tokenize_and_encode(o) for o in obj)
+
     logger.info("Encoding dataset...")
     train_dataset = load_rocstories_dataset(args.train_dataset)
     eval_dataset = load_rocstories_dataset(args.eval_dataset)
@@ -178,8 +200,11 @@ def main():
 
     # Compute the max input length for the Transformer
     max_length = model.config.n_positions // 2 - 2
-    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
-                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
+    input_length = max(
+        len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
+        for dataset in encoded_datasets
+        for story, cont1, cont2, _ in dataset
+    )
     input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model
 
     # Prepare inputs tensors and dataloaders
@@ -198,20 +223,23 @@ def main():
     if args.do_train:
         if args.max_steps > 0:
             t_total = args.max_steps
-            args.num_train_epochs = args.max_steps //\
-                (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
         else:
-            t_total = len(train_dataloader)\
-                // args.gradient_accumulation_steps * args.num_train_epochs
+            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
         param_optimizer = list(model.named_parameters())
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
         optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
+            {
+                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
+                "weight_decay": args.weight_decay,
+            },
+            {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+        ]
         optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+        scheduler = get_linear_schedule_with_warmup(
+            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+        )
 
     if args.do_train:
         nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
@@ -230,14 +258,16 @@ def main():
                 optimizer.step()
                 optimizer.zero_grad()
                 tr_loss += loss.item()
-                exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
+                exp_average_loss = (
+                    loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
+                )
                 nb_tr_steps += 1
                 tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])
 
     # Save a trained model
     if args.do_train:
         # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
+        model_to_save = model.module if hasattr(model, "module") else model  # Only save the model itself
 
         # If we save using the predefined names, we can load using `from_pretrained`
         output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
@@ -260,10 +290,12 @@ def main():
             batch = tuple(t.to(device) for t in batch)
             input_ids, mc_token_ids, lm_labels, mc_labels = batch
             with torch.no_grad():
-               _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
+                _, mc_loss, _, mc_logits = model(
+                    input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels
+                )
 
             mc_logits = mc_logits.detach().cpu().numpy()
-            mc_labels = mc_labels.to('cpu').numpy()
+            mc_labels = mc_labels.to("cpu").numpy()
             tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
 
             eval_loss += mc_loss.mean().item()
@@ -274,10 +306,8 @@ def main():
 
         eval_loss = eval_loss / nb_eval_steps
         eval_accuracy = eval_accuracy / nb_eval_examples
-        train_loss = tr_loss/nb_tr_steps if args.do_train else None
-        result = {'eval_loss': eval_loss,
-                  'eval_accuracy': eval_accuracy,
-                  'train_loss': train_loss}
+        train_loss = tr_loss / nb_tr_steps if args.do_train else None
+        result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "train_loss": train_loss}
 
         output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
         with open(output_eval_file, "w") as writer:
@@ -286,5 +316,6 @@ def main():
                 logger.info("  %s = %s", key, str(result[key]))
                 writer.write("%s = %s\n" % (key, str(result[key])))
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py
index 5de93db7fe..d03d1aacec 100644
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -28,8 +28,7 @@ import glob
 
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 
 try:
@@ -39,31 +38,23 @@ except:
 
 from tqdm import tqdm, trange
 
-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForMultipleChoice, BertTokenizer)
+from transformers import WEIGHTS_NAME, BertConfig, BertForMultipleChoice, BertTokenizer
 
 from transformers import AdamW, get_linear_schedule_with_warmup
 
 logger = logging.getLogger(__name__)
 
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
-                  for conf in [BertConfig]), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in [BertConfig]), ())
 
 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForMultipleChoice, BertTokenizer),
+    "bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
 }
 
+
 class SwagExample(object):
     """A single training/test example for the SWAG dataset."""
-    def __init__(self,
-                 swag_id,
-                 context_sentence,
-                 start_ending,
-                 ending_0,
-                 ending_1,
-                 ending_2,
-                 ending_3,
-                 label = None):
+
+    def __init__(self, swag_id, context_sentence, start_ending, ending_0, ending_1, ending_2, ending_3, label=None):
         self.swag_id = swag_id
         self.context_sentence = context_sentence
         self.start_ending = start_ending
@@ -94,57 +85,49 @@ class SwagExample(object):
 
         return ", ".join(l)
 
-class InputFeatures(object):
-    def __init__(self,
-                 example_id,
-                 choices_features,
-                 label
 
-    ):
+class InputFeatures(object):
+    def __init__(self, example_id, choices_features, label):
         self.example_id = example_id
         self.choices_features = [
-            {
-                'input_ids': input_ids,
-                'input_mask': input_mask,
-                'segment_ids': segment_ids
-            }
+            {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
             for _, input_ids, input_mask, segment_ids in choices_features
         ]
         self.label = label
 
+
 def read_swag_examples(input_file, is_training=True):
-    with open(input_file, 'r', encoding='utf-8') as f:
+    with open(input_file, "r", encoding="utf-8") as f:
         reader = csv.reader(f)
         lines = []
         for line in reader:
             if sys.version_info[0] == 2:
-                line = list(unicode(cell, 'utf-8') for cell in line)
+                line = list(unicode(cell, "utf-8") for cell in line)
             lines.append(line)
 
-    if is_training and lines[0][-1] != 'label':
-        raise ValueError(
-            "For training, the input file must contain a label column."
-        )
+    if is_training and lines[0][-1] != "label":
+        raise ValueError("For training, the input file must contain a label column.")
 
     examples = [
         SwagExample(
-            swag_id = line[2],
-            context_sentence = line[4],
-            start_ending = line[5], # in the swag dataset, the
-                                         # common beginning of each
-                                         # choice is stored in "sent2".
-            ending_0 = line[7],
-            ending_1 = line[8],
-            ending_2 = line[9],
-            ending_3 = line[10],
-            label = int(line[11]) if is_training else None
-        ) for line in lines[1:] # we skip the line with the column names
+            swag_id=line[2],
+            context_sentence=line[4],
+            start_ending=line[5],  # in the swag dataset, the
+            # common beginning of each
+            # choice is stored in "sent2".
+            ending_0=line[7],
+            ending_1=line[8],
+            ending_2=line[9],
+            ending_3=line[10],
+            label=int(line[11]) if is_training else None,
+        )
+        for line in lines[1:]  # we skip the line with the column names
     ]
 
     return examples
 
-def convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                 is_training):
+
+def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training):
     """Loads a data file into a list of `InputBatch`s."""
 
     # Swag is a multiple choice task. To perform this task using Bert,
@@ -204,23 +187,18 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
             logger.info("swag_id: {}".format(example.swag_id))
             for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
                 logger.info("choice: {}".format(choice_idx))
-                logger.info("tokens: {}".format(' '.join(tokens)))
-                logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
-                logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
-                logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
+                logger.info("tokens: {}".format(" ".join(tokens)))
+                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
+                logger.info("input_mask: {}".format(" ".join(map(str, input_mask))))
+                logger.info("segment_ids: {}".format(" ".join(map(str, segment_ids))))
             if is_training:
                 logger.info("label: {}".format(label))
 
-        features.append(
-            InputFeatures(
-                example_id = example.swag_id,
-                choices_features = choices_features,
-                label = label
-            )
-        )
+        features.append(InputFeatures(example_id=example.swag_id, choices_features=choices_features, label=label))
 
     return features
 
+
 def _truncate_seq_pair(tokens_a, tokens_b, max_length):
     """Truncates a sequence pair in place to the maximum length."""
 
@@ -237,18 +215,14 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length):
         else:
             tokens_b.pop()
 
+
 def accuracy(out, labels):
     outputs = np.argmax(out, axis=1)
     return np.sum(outputs == labels)
 
+
 def select_field(features, field):
-    return [
-        [
-            choice[field]
-            for choice in feature.choices_features
-        ]
-        for feature in features
-    ]
+    return [[choice[field] for choice in feature.choices_features] for feature in features]
 
 
 def set_seed(args):
@@ -258,24 +232,28 @@ def set_seed(args):
     if args.n_gpu > 0:
         torch.cuda.manual_seed_all(args.seed)
 
+
 def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
     if args.local_rank not in [-1, 0]:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
 
     # Load data features from cache or dataset file
     input_file = args.predict_file if evaluate else args.train_file
-    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
-        'dev' if evaluate else 'train',
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length)))
+    cached_features_file = os.path.join(
+        os.path.dirname(input_file),
+        "cached_{}_{}_{}".format(
+            "dev" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+        ),
+    )
     if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
         logger.info("Creating features from dataset file at %s", input_file)
         examples = read_swag_examples(input_file)
-        features = convert_examples_to_features(
-            examples, tokenizer, args.max_seq_length, not evaluate)
+        features = convert_examples_to_features(examples, tokenizer, args.max_seq_length, not evaluate)
 
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
@@ -285,21 +263,21 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
 
     # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
-    all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
-    all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
+    all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
+    all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
+    all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
     all_label = torch.tensor([f.label for f in features], dtype=torch.long)
 
     if evaluate:
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_label)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
     else:
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_label)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
 
     if output_examples:
         return dataset, examples, features
     return dataset
+
+
 def train(args, train_dataset, model, tokenizer):
     """ Train the model """
     if args.local_rank in [-1, 0]:
@@ -316,13 +294,18 @@ def train(args, train_dataset, model, tokenizer):
         t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
     # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
     if args.fp16:
         try:
             from apex import amp
@@ -336,17 +319,21 @@ def train(args, train_dataset, model, tokenizer):
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
 
     # Train!
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
     logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
     logger.info("  Total optimization steps = %d", t_total)
 
@@ -360,11 +347,13 @@ def train(args, train_dataset, model, tokenizer):
         for step, batch in enumerate(epoch_iterator):
             model.train()
             batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':       batch[0],
-                      'attention_mask':  batch[1],
-                      #'token_type_ids':  None if args.model_type == 'xlm' else batch[2],
-                      'token_type_ids': batch[2],
-                      'labels':         batch[3]}
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                #'token_type_ids':  None if args.model_type == 'xlm' else batch[2],
+                "token_type_ids": batch[2],
+                "labels": batch[3],
+            }
             # if args.model_type in ['xlnet', 'xlm']:
             #     inputs.update({'cls_index': batch[5],
             #                    'p_mask':       batch[6]})
@@ -372,7 +361,7 @@ def train(args, train_dataset, model, tokenizer):
             loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
 
             if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
             if args.gradient_accumulation_steps > 1:
                 loss = loss / args.gradient_accumulation_steps
 
@@ -393,23 +382,27 @@ def train(args, train_dataset, model, tokenizer):
 
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                         results = evaluate(args, model, tokenizer)
                         for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                     logging_loss = tr_loss
 
                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                     # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                     model_to_save.save_pretrained(output_dir)
                     tokenizer.save_vocabulary(output_dir)
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
@@ -424,6 +417,7 @@ def train(args, train_dataset, model, tokenizer):
 
     return global_step, tr_loss / global_step
 
+
 def evaluate(args, model, tokenizer, prefix=""):
     dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
 
@@ -440,7 +434,6 @@ def evaluate(args, model, tokenizer, prefix=""):
     logger.info("  Num examples = %d", len(dataset))
     logger.info("  Batch size = %d", args.eval_batch_size)
 
-
     eval_loss, eval_accuracy = 0, 0
     nb_eval_steps, nb_eval_examples = 0, 0
 
@@ -448,11 +441,13 @@ def evaluate(args, model, tokenizer, prefix=""):
         model.eval()
         batch = tuple(t.to(args.device) for t in batch)
         with torch.no_grad():
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      # 'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
-                      'token_type_ids': batch[2],
-                      'labels':         batch[3]}
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                # 'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+                "token_type_ids": batch[2],
+                "labels": batch[3],
+            }
 
             # if args.model_type in ['xlnet', 'xlm']:
             #     inputs.update({'cls_index': batch[4],
@@ -462,17 +457,16 @@ def evaluate(args, model, tokenizer, prefix=""):
             eval_loss += tmp_eval_loss.mean().item()
 
         logits = logits.detach().cpu().numpy()
-        label_ids = inputs['labels'].to('cpu').numpy()
+        label_ids = inputs["labels"].to("cpu").numpy()
         tmp_eval_accuracy = accuracy(logits, label_ids)
         eval_accuracy += tmp_eval_accuracy
 
         nb_eval_steps += 1
-        nb_eval_examples += inputs['input_ids'].size(0)
+        nb_eval_examples += inputs["input_ids"].size(0)
 
     eval_loss = eval_loss / nb_eval_steps
     eval_accuracy = eval_accuracy / nb_eval_examples
-    result = {'eval_loss': eval_loss,
-              'eval_accuracy': eval_accuracy}
+    result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy}
 
     output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
     with open(output_eval_file, "w") as writer:
@@ -483,92 +477,144 @@ def evaluate(args, model, tokenizer, prefix=""):
 
     return result
 
+
 def main():
     parser = argparse.ArgumentParser()
 
     ## Required parameters
-    parser.add_argument("--train_file", default=None, type=str, required=True,
-                        help="SWAG csv for training. E.g., train.csv")
-    parser.add_argument("--predict_file", default=None, type=str, required=True,
-                        help="SWAG csv for predictions. E.g., val.csv or test.csv")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model checkpoints and predictions will be written.")
+    parser.add_argument(
+        "--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv"
+    )
+    parser.add_argument(
+        "--predict_file",
+        default=None,
+        type=str,
+        required=True,
+        help="SWAG csv for predictions. E.g., val.csv or test.csv",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model checkpoints and predictions will be written.",
+    )
 
     ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--max_seq_length", default=384, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences "
-                             "longer than this will be truncated, and sequences shorter than this will be padded.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=384,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences "
+        "longer than this will be truncated, and sequences shorter than this will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
 
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
 
-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
 
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
     args = parser.parse_args()
 
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
 
     # Setup distant debugging if needed
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
+
         print("Waiting for debugger attach")
         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()
@@ -580,16 +626,24 @@ def main():
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
         args.n_gpu = 1
     args.device = device
 
     # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
 
     # Set seed
     set_seed(args)
@@ -601,8 +655,12 @@ def main():
     args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
     config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
+    )
 
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -617,7 +675,6 @@ def main():
         global_step, tr_loss = train(args, train_dataset, model, tokenizer)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
-
     # Save the trained model and the tokenizer
     if args.local_rank == -1 or torch.distributed.get_rank() == 0:
         # Create output directory if needed
@@ -627,19 +684,20 @@ def main():
         logger.info("Saving model checkpoint to %s", args.output_dir)
         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
         model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = model_class.from_pretrained(args.output_dir)
         tokenizer = tokenizer_class.from_pretrained(args.output_dir)
         model.to(args.device)
 
-
     # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
@@ -650,14 +708,16 @@ def main():
             checkpoints = [args.model_name_or_path]
 
         if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
             logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
 
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
 
         for checkpoint in checkpoints:
             # Reload the model
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
             model = model_class.from_pretrained(checkpoint)
             tokenizer = tokenizer_class.from_pretrained(checkpoint)
             model.to(args.device)
@@ -665,7 +725,7 @@ def main():
             # Evaluate
             result = evaluate(args, model, tokenizer, prefix=global_step)
 
-            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
             results.update(result)
 
     logger.info("Results: {}".format(results))
diff --git a/examples/contrib/run_transfo_xl.py b/examples/contrib/run_transfo_xl.py
index f5375269b8..1ef66bef1e 100644
--- a/examples/contrib/run_transfo_xl.py
+++ b/examples/contrib/run_transfo_xl.py
@@ -30,44 +30,36 @@ import torch
 
 from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
 
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
 logger = logging.getLogger(__name__)
 
+
 def main():
-    parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
-    parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
-                        help='pretrained model name')
-    parser.add_argument('--split', type=str, default='test',
-                        choices=['all', 'valid', 'test'],
-                        help='which split to evaluate')
-    parser.add_argument('--batch_size', type=int, default=10,
-                        help='batch size')
-    parser.add_argument('--tgt_len', type=int, default=128,
-                        help='number of tokens to predict')
-    parser.add_argument('--ext_len', type=int, default=0,
-                        help='length of the extended context')
-    parser.add_argument('--mem_len', type=int, default=1600,
-                        help='length of the retained previous heads')
-    parser.add_argument('--clamp_len', type=int, default=1000,
-                        help='max positional embedding index')
-    parser.add_argument('--no_cuda', action='store_true',
-                        help='Do not use CUDA even though CUA is available')
-    parser.add_argument('--work_dir', type=str, required=True,
-                        help='path to the work_dir')
-    parser.add_argument('--no_log', action='store_true',
-                        help='do not log the eval result')
-    parser.add_argument('--same_length', action='store_true',
-                        help='set same length attention with masking')
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser = argparse.ArgumentParser(description="PyTorch Transformer Language Model")
+    parser.add_argument("--model_name", type=str, default="transfo-xl-wt103", help="pretrained model name")
+    parser.add_argument(
+        "--split", type=str, default="test", choices=["all", "valid", "test"], help="which split to evaluate"
+    )
+    parser.add_argument("--batch_size", type=int, default=10, help="batch size")
+    parser.add_argument("--tgt_len", type=int, default=128, help="number of tokens to predict")
+    parser.add_argument("--ext_len", type=int, default=0, help="length of the extended context")
+    parser.add_argument("--mem_len", type=int, default=1600, help="length of the retained previous heads")
+    parser.add_argument("--clamp_len", type=int, default=1000, help="max positional embedding index")
+    parser.add_argument("--no_cuda", action="store_true", help="Do not use CUDA even though CUA is available")
+    parser.add_argument("--work_dir", type=str, required=True, help="path to the work_dir")
+    parser.add_argument("--no_log", action="store_true", help="do not log the eval result")
+    parser.add_argument("--same_length", action="store_true", help="set same length attention with masking")
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
     args = parser.parse_args()
-    assert args.ext_len >= 0, 'extended context length must be non-negative'
+    assert args.ext_len >= 0, "extended context length must be non-negative"
 
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
+
         print("Waiting for debugger attach")
         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()
@@ -84,17 +76,18 @@ def main():
     corpus = TransfoXLCorpus.from_pretrained(args.model_name)
     ntokens = len(corpus.vocab)
 
-    va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
-        device=device, ext_len=args.ext_len)
-    te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
-        device=device, ext_len=args.ext_len)
+    va_iter = corpus.get_iterator("valid", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
+    te_iter = corpus.get_iterator("test", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
 
     # Load a pre-trained model
     model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
     model = model.to(device)
 
-    logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
-        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
+    logger.info(
+        "Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".format(
+            args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len
+        )
+    )
 
     model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
     if args.clamp_len > 0:
@@ -108,7 +101,7 @@ def main():
     def evaluate(eval_iter):
         # Turn on evaluation mode which disables dropout.
         model.eval()
-        total_len, total_loss = 0, 0.
+        total_len, total_loss = 0, 0.0
         start_time = time.time()
         with torch.no_grad():
             mems = None
@@ -119,35 +112,34 @@ def main():
                 total_loss += seq_len * loss.item()
                 total_len += seq_len
             total_time = time.time() - start_time
-        logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
-                total_time, 1000 * total_time / (idx+1)))
+        logger.info("Time : {:.2f}s, {:.2f}ms/segment".format(total_time, 1000 * total_time / (idx + 1)))
         return total_loss / total_len
 
     # Run on test data.
-    if args.split == 'all':
+    if args.split == "all":
         test_loss = evaluate(te_iter)
         valid_loss = evaluate(va_iter)
-    elif args.split == 'valid':
+    elif args.split == "valid":
         valid_loss = evaluate(va_iter)
         test_loss = None
-    elif args.split == 'test':
+    elif args.split == "test":
         test_loss = evaluate(te_iter)
         valid_loss = None
 
     def format_log(loss, split):
-        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
-            split, loss, math.exp(loss))
+        log_str = "| {0} loss {1:5.2f} | {0} ppl {2:9.3f} ".format(split, loss, math.exp(loss))
         return log_str
 
-    log_str = ''
+    log_str = ""
     if valid_loss is not None:
-        log_str += format_log(valid_loss, 'valid')
+        log_str += format_log(valid_loss, "valid")
     if test_loss is not None:
-        log_str += format_log(test_loss, 'test')
+        log_str += format_log(test_loss, "test")
 
-    logger.info('=' * 100)
+    logger.info("=" * 100)
     logger.info(log_str)
-    logger.info('=' * 100)
+    logger.info("=" * 100)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py
index d5a86247a8..e3bf0d443e 100644
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -40,14 +40,12 @@ from utils import logger
 from lm_seqs_dataset import LmSeqsDataset
 from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups
 
+
 class Distiller:
-    def __init__(self,
-                 params: dict,
-                 dataset: LmSeqsDataset,
-                 token_probs: torch.tensor,
-                 student: nn.Module,
-                 teacher: nn.Module):
-        logger.info('Initializing Distiller')
+    def __init__(
+        self, params: dict, dataset: LmSeqsDataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module
+    ):
+        logger.info("Initializing Distiller")
         self.params = params
         self.dump_path = params.dump_path
         self.multi_gpu = params.multi_gpu
@@ -70,12 +68,10 @@ class Distiller:
         else:
             sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)
 
-        self.dataloader = DataLoader(dataset=dataset,
-                                     batch_sampler=sampler,
-                                     collate_fn=dataset.batch_sequences)
+        self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences)
 
         self.temperature = params.temperature
-        assert self.temperature > 0.
+        assert self.temperature > 0.0
 
         self.alpha_ce = params.alpha_ce
         self.alpha_mlm = params.alpha_mlm
@@ -85,18 +81,18 @@ class Distiller:
 
         self.mlm = params.mlm
         if self.mlm:
-            logger.info(f'Using MLM loss for LM step.')
+            logger.info(f"Using MLM loss for LM step.")
             self.mlm_mask_prop = params.mlm_mask_prop
             assert 0.0 <= self.mlm_mask_prop <= 1.0
             assert params.word_mask + params.word_keep + params.word_rand == 1.0
             self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
-            self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs
-            self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs
+            self.pred_probs = self.pred_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else self.pred_probs
+            self.token_probs = token_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else token_probs
             if self.fp16:
                 self.pred_probs = self.pred_probs.half()
                 self.token_probs = self.token_probs.half()
         else:
-            logger.info(f'Using CLM loss for LM step.')
+            logger.info(f"Using CLM loss for LM step.")
 
         self.epoch = 0
         self.n_iter = 0
@@ -107,38 +103,54 @@ class Distiller:
         self.last_loss_ce = 0
         self.last_loss_mlm = 0
         self.last_loss_clm = 0
-        if self.alpha_mse > 0.: self.last_loss_mse = 0
-        if self.alpha_cos > 0.: self.last_loss_cos = 0
+        if self.alpha_mse > 0.0:
+            self.last_loss_mse = 0
+        if self.alpha_cos > 0.0:
+            self.last_loss_cos = 0
         self.last_log = 0
 
-        self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
+        self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
         self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
-        if self.alpha_mse > 0.:
-            self.mse_loss_fct = nn.MSELoss(reduction='sum')
-        if self.alpha_cos > 0.:
-            self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction='mean')
+        if self.alpha_mse > 0.0:
+            self.mse_loss_fct = nn.MSELoss(reduction="sum")
+        if self.alpha_cos > 0.0:
+            self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean")
 
-        logger.info('--- Initializing model optimizer')
+        logger.info("--- Initializing model optimizer")
         assert params.gradient_accumulation_steps >= 1
         self.num_steps_epoch = len(self.dataloader)
-        num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
+        num_train_optimization_steps = (
+            int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
+        )
 
-        no_decay = ['bias', 'LayerNorm.weight']
+        no_decay = ["bias", "LayerNorm.weight"]
         optimizer_grouped_parameters = [
-            {'params': [p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': params.weight_decay},
-            {'params': [p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': 0.0}
+            {
+                "params": [
+                    p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad
+                ],
+                "weight_decay": params.weight_decay,
+            },
+            {
+                "params": [
+                    p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad
+                ],
+                "weight_decay": 0.0,
+            },
         ]
-        logger.info("------ Number of trainable parameters (student): %i" % sum([p.numel() for p in self.student.parameters() if p.requires_grad]))
+        logger.info(
+            "------ Number of trainable parameters (student): %i"
+            % sum([p.numel() for p in self.student.parameters() if p.requires_grad])
+        )
         logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()]))
-        self.optimizer = AdamW(optimizer_grouped_parameters,
-                               lr=params.learning_rate,
-                               eps=params.adam_epsilon,
-                               betas=(0.9, 0.98))
+        self.optimizer = AdamW(
+            optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98)
+        )
 
         warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
-        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
-                                                num_warmup_steps=warmup_steps,
-                                                num_training_steps=num_train_optimization_steps)
+        self.scheduler = get_linear_schedule_with_warmup(
+            self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps
+        )
 
         if self.fp16:
             try:
@@ -146,33 +158,36 @@ class Distiller:
             except ImportError:
                 raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
             logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level")
-            self.student, self.optimizer = amp.initialize(self.student,
-                                                          self.optimizer,
-                                                          opt_level=self.params.fp16_opt_level)
+            self.student, self.optimizer = amp.initialize(
+                self.student, self.optimizer, opt_level=self.params.fp16_opt_level
+            )
             self.teacher = self.teacher.half()
 
         if self.multi_gpu:
             if self.fp16:
                 from apex.parallel import DistributedDataParallel
+
                 logger.info("Using apex.parallel.DistributedDataParallel for distributed training.")
                 self.student = DistributedDataParallel(self.student)
             else:
                 from torch.nn.parallel import DistributedDataParallel
+
                 logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
-                self.student = DistributedDataParallel(self.student,
-                                                       device_ids=[params.local_rank],
-                                                       output_device=params.local_rank,
-                                                       find_unused_parameters=True)
+                self.student = DistributedDataParallel(
+                    self.student,
+                    device_ids=[params.local_rank],
+                    output_device=params.local_rank,
+                    find_unused_parameters=True,
+                )
 
         self.is_master = params.is_master
         if self.is_master:
-            logger.info('--- Initializing Tensorboard')
-            self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, 'log', 'train'))
-            self.tensorboard.add_text(tag='config/training', text_string=str(self.params), global_step=0)
-            self.tensorboard.add_text(tag='config/student', text_string=str(self.student_config), global_step=0)
+            logger.info("--- Initializing Tensorboard")
+            self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, "log", "train"))
+            self.tensorboard.add_text(tag="config/training", text_string=str(self.params), global_step=0)
+            self.tensorboard.add_text(tag="config/student", text_string=str(self.student_config), global_step=0)
 
-    def prepare_batch_mlm(self,
-                          batch):
+    def prepare_batch_mlm(self, batch):
         """
         Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM.
 
@@ -192,7 +207,7 @@ class Distiller:
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
         assert token_ids.size(0) == lengths.size(0)
 
-        attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
+        attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
 
         bs, max_seq_len = token_ids.size()
         mlm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
@@ -200,11 +215,13 @@ class Distiller:
         x_prob = self.token_probs[token_ids.flatten()]
         n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item())
         tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False)
-        pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.bool, device=token_ids.device) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
+        pred_mask = torch.zeros(
+            bs * max_seq_len, dtype=torch.bool, device=token_ids.device
+        )  # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
         pred_mask[tgt_ids] = 1
         pred_mask = pred_mask.view(bs, max_seq_len)
 
-        pred_mask[token_ids == self.params.special_tok_ids['pad_token']] = 0
+        pred_mask[token_ids == self.params.special_tok_ids["pad_token"]] = 0
 
         # mask a number of words == 0 [8] (faster with fp16)
         if self.fp16:
@@ -213,26 +230,29 @@ class Distiller:
                 pred_mask = pred_mask.view(-1)
                 n2 = max(n1 % 8, 8 * (n1 // 8))
                 if n2 != n1:
-                    pred_mask[torch.nonzero(pred_mask).view(-1)[:n1-n2]] = 0
+                    pred_mask[torch.nonzero(pred_mask).view(-1)[: n1 - n2]] = 0
                 pred_mask = pred_mask.view(bs, max_seq_len)
                 assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item()
 
         _token_ids_real = token_ids[pred_mask]
         _token_ids_rand = _token_ids_real.clone().random_(self.vocab_size)
-        _token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids['mask_token'])
+        _token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids["mask_token"])
         probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True)
-        _token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
+        _token_ids = (
+            _token_ids_mask * (probs == 0).long()
+            + _token_ids_real * (probs == 1).long()
+            + _token_ids_rand * (probs == 2).long()
+        )
         token_ids = token_ids.masked_scatter(pred_mask, _token_ids)
 
-        mlm_labels[~pred_mask] = -100 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
+        mlm_labels[~pred_mask] = -100  # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
 
         # sanity checks
         assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
 
         return token_ids, attn_mask, mlm_labels
 
-    def prepare_batch_clm(self,
-                          batch):
+    def prepare_batch_clm(self, batch):
         """
         Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the labels for CLM.
 
@@ -252,18 +272,16 @@ class Distiller:
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
         assert token_ids.size(0) == lengths.size(0)
 
-        attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
+        attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
         clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
-        clm_labels[~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
+        clm_labels[~attn_mask] = -100  # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
 
         # sanity checks
         assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
 
         return token_ids, attn_mask, clm_labels
 
-    def round_batch(self,
-                    x: torch.tensor,
-                    lengths: torch.tensor):
+    def round_batch(self, x: torch.tensor, lengths: torch.tensor):
         """
         For float16 only.
         Sub-sample sentences in a batch, and add padding, so that each dimension is a multiple of 8.
@@ -299,9 +317,9 @@ class Distiller:
             pad = 8 - (ml1 % 8)
             ml2 = ml1 + pad
             if self.mlm:
-                pad_id = self.params.special_tok_ids['pad_token']
+                pad_id = self.params.special_tok_ids["pad_token"]
             else:
-                pad_id = self.params.special_tok_ids['unk_token']
+                pad_id = self.params.special_tok_ids["unk_token"]
             padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id)
             x = torch.cat([x, padding_tensor], 1)
             assert x.size() == (bs2, ml2)
@@ -314,20 +332,22 @@ class Distiller:
         """
         The real training loop.
         """
-        if self.is_master: logger.info('Starting training')
+        if self.is_master:
+            logger.info("Starting training")
         self.last_log = time.time()
         self.student.train()
         self.teacher.eval()
 
         for _ in range(self.params.n_epoch):
-            if self.is_master: logger.info(f'--- Starting epoch {self.epoch}/{self.params.n_epoch-1}')
+            if self.is_master:
+                logger.info(f"--- Starting epoch {self.epoch}/{self.params.n_epoch-1}")
             if self.multi_gpu:
                 torch.distributed.barrier()
 
             iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
             for batch in iter_bar:
                 if self.params.n_gpu > 0:
-                    batch = tuple(t.to(f'cuda:{self.params.local_rank}') for t in batch)
+                    batch = tuple(t.to(f"cuda:{self.params.local_rank}") for t in batch)
 
                 if self.mlm:
                     token_ids, attn_mask, lm_labels = self.prepare_batch_mlm(batch=batch)
@@ -336,22 +356,21 @@ class Distiller:
                 self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels)
 
                 iter_bar.update()
-                iter_bar.set_postfix({'Last_loss': f'{self.last_loss:.2f}',
-                                      'Avg_cum_loss': f'{self.total_loss_epoch/self.n_iter:.2f}'})
+                iter_bar.set_postfix(
+                    {"Last_loss": f"{self.last_loss:.2f}", "Avg_cum_loss": f"{self.total_loss_epoch/self.n_iter:.2f}"}
+                )
             iter_bar.close()
 
-            if self.is_master: logger.info(f'--- Ending epoch {self.epoch}/{self.params.n_epoch-1}')
+            if self.is_master:
+                logger.info(f"--- Ending epoch {self.epoch}/{self.params.n_epoch-1}")
             self.end_epoch()
 
         if self.is_master:
-            logger.info(f'Save very last checkpoint as `pytorch_model.bin`.')
-            self.save_checkpoint(checkpoint_name=f'pytorch_model.bin')
-            logger.info('Training is finished')
+            logger.info(f"Save very last checkpoint as `pytorch_model.bin`.")
+            self.save_checkpoint(checkpoint_name=f"pytorch_model.bin")
+            logger.info("Training is finished")
 
-    def step(self,
-             input_ids: torch.tensor,
-             attention_mask: torch.tensor,
-             lm_labels: torch.tensor):
+    def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor):
         """
         One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
         and possibly a parameter update (depending on the gradient accumulation).
@@ -363,78 +382,91 @@ class Distiller:
         lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM).
         """
         if self.mlm:
-            s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask)     # (bs, seq_length, voc_size)
+            s_logits, s_hidden_states = self.student(
+                input_ids=input_ids, attention_mask=attention_mask
+            )  # (bs, seq_length, voc_size)
             with torch.no_grad():
-                t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
+                t_logits, t_hidden_states = self.teacher(
+                    input_ids=input_ids, attention_mask=attention_mask
+                )  # (bs, seq_length, voc_size)
         else:
-            s_logits, _, s_hidden_states = self.student(input_ids=input_ids, attention_mask=None)            # (bs, seq_length, voc_size)
+            s_logits, _, s_hidden_states = self.student(
+                input_ids=input_ids, attention_mask=None
+            )  # (bs, seq_length, voc_size)
             with torch.no_grad():
-                t_logits, _, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=None)           # (bs, seq_length, voc_size)
+                t_logits, _, t_hidden_states = self.teacher(
+                    input_ids=input_ids, attention_mask=None
+                )  # (bs, seq_length, voc_size)
         assert s_logits.size() == t_logits.size()
 
-        #https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
-        #https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
+        # https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
+        # https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
         if self.params.restrict_ce_to_mask:
-            mask = (lm_labels>-1).unsqueeze(-1).expand_as(s_logits)    # (bs, seq_lenth, voc_size)
+            mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits)  # (bs, seq_lenth, voc_size)
         else:
-            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)    # (bs, seq_lenth, voc_size)
-        s_logits_slct = torch.masked_select(s_logits, mask)            # (bs * seq_length * voc_size) modulo the 1s in mask
-        s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1))      # (bs * seq_length, voc_size) modulo the 1s in mask
-        t_logits_slct = torch.masked_select(t_logits, mask)            # (bs * seq_length * voc_size) modulo the 1s in mask
-        t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1))      # (bs * seq_length, voc_size) modulo the 1s in mask
+            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)  # (bs, seq_lenth, voc_size)
+        s_logits_slct = torch.masked_select(s_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
+        s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
+        t_logits_slct = torch.masked_select(t_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
+        t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
         assert t_logits_slct.size() == s_logits_slct.size()
 
-        loss_ce = self.ce_loss_fct(F.log_softmax(s_logits_slct/self.temperature, dim=-1),
-                                   F.softmax(t_logits_slct/self.temperature, dim=-1)) * (self.temperature)**2
-        loss = self.alpha_ce*loss_ce
+        loss_ce = (
+            self.ce_loss_fct(
+                F.log_softmax(s_logits_slct / self.temperature, dim=-1),
+                F.softmax(t_logits_slct / self.temperature, dim=-1),
+            )
+            * (self.temperature) ** 2
+        )
+        loss = self.alpha_ce * loss_ce
 
-        if self.alpha_mlm > 0.:
+        if self.alpha_mlm > 0.0:
             loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1))
             loss += self.alpha_mlm * loss_mlm
-        if self.alpha_clm > 0.:
+        if self.alpha_clm > 0.0:
             shift_logits = s_logits[..., :-1, :].contiguous()
             shift_labels = lm_labels[..., 1:].contiguous()
-            loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-                                        shift_labels.view(-1))
+            loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
             loss += self.alpha_clm * loss_clm
 
-        if self.alpha_mse > 0.:
-            loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
+        if self.alpha_mse > 0.0:
+            loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct) / s_logits_slct.size(
+                0
+            )  # Reproducing batchmean reduction
             loss += self.alpha_mse * loss_mse
-        if self.alpha_cos > 0.:
-            s_hidden_states = s_hidden_states[-1]                              # (bs, seq_length, dim)
-            t_hidden_states = t_hidden_states[-1]                              # (bs, seq_length, dim)
-            mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states)     # (bs, seq_length, dim)
+        if self.alpha_cos > 0.0:
+            s_hidden_states = s_hidden_states[-1]  # (bs, seq_length, dim)
+            t_hidden_states = t_hidden_states[-1]  # (bs, seq_length, dim)
+            mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states)  # (bs, seq_length, dim)
             assert s_hidden_states.size() == t_hidden_states.size()
             dim = s_hidden_states.size(-1)
-            
-            s_hidden_states_slct = torch.masked_select(s_hidden_states, mask)        # (bs * seq_length * dim)
-            s_hidden_states_slct = s_hidden_states_slct.view(-1, dim)                # (bs * seq_length, dim)
-            t_hidden_states_slct = torch.masked_select(t_hidden_states, mask)        # (bs * seq_length * dim)
-            t_hidden_states_slct = t_hidden_states_slct.view(-1, dim)                # (bs * seq_length, dim)
-        
-            target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,)
+
+            s_hidden_states_slct = torch.masked_select(s_hidden_states, mask)  # (bs * seq_length * dim)
+            s_hidden_states_slct = s_hidden_states_slct.view(-1, dim)  # (bs * seq_length, dim)
+            t_hidden_states_slct = torch.masked_select(t_hidden_states, mask)  # (bs * seq_length * dim)
+            t_hidden_states_slct = t_hidden_states_slct.view(-1, dim)  # (bs * seq_length, dim)
+
+            target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1)  # (bs * seq_length,)
             loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target)
             loss += self.alpha_cos * loss_cos
 
         self.total_loss_epoch += loss.item()
         self.last_loss = loss.item()
         self.last_loss_ce = loss_ce.item()
-        if self.alpha_mlm > 0.:
+        if self.alpha_mlm > 0.0:
             self.last_loss_mlm = loss_mlm.item()
-        if self.alpha_clm > 0.:
+        if self.alpha_clm > 0.0:
             self.last_loss_clm = loss_clm.item()
-        if self.alpha_mse > 0.:
+        if self.alpha_mse > 0.0:
             self.last_loss_mse = loss_mse.item()
-        if self.alpha_cos > 0.:
+        if self.alpha_cos > 0.0:
             self.last_loss_cos = loss_cos.item()
 
         self.optimize(loss)
 
         self.n_sequences_epoch += input_ids.size(0)
 
-    def optimize(self,
-                 loss):
+    def optimize(self, loss):
         """
         Normalization on the loss (gradient accumulation or distributed training), followed by
         backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
@@ -442,7 +474,7 @@ class Distiller:
         """
         # Check for NaN
         if (loss != loss).data.any():
-            logger.error('NaN detected')
+            logger.error("NaN detected")
             exit()
 
         if self.multi_gpu:
@@ -452,6 +484,7 @@ class Distiller:
 
         if self.fp16:
             from apex import amp
+
             with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                 scaled_loss.backward()
         else:
@@ -488,53 +521,84 @@ class Distiller:
             return
 
         for param_name, param in self.student.named_parameters():
-            self.tensorboard.add_scalar(tag='parameter_mean/' + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter)
-            self.tensorboard.add_scalar(tag='parameter_std/' + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter)
+            self.tensorboard.add_scalar(
+                tag="parameter_mean/" + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter
+            )
+            self.tensorboard.add_scalar(
+                tag="parameter_std/" + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter
+            )
             if param.grad is None:
                 continue
-            self.tensorboard.add_scalar(tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(),global_step=self.n_total_iter)
-            self.tensorboard.add_scalar(tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter)
+            self.tensorboard.add_scalar(
+                tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(), global_step=self.n_total_iter
+            )
+            self.tensorboard.add_scalar(
+                tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter
+            )
 
-        self.tensorboard.add_scalar(tag="losses/cum_avg_loss_epoch", scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.n_total_iter)
+        self.tensorboard.add_scalar(
+            tag="losses/cum_avg_loss_epoch",
+            scalar_value=self.total_loss_epoch / self.n_iter,
+            global_step=self.n_total_iter,
+        )
         self.tensorboard.add_scalar(tag="losses/loss", scalar_value=self.last_loss, global_step=self.n_total_iter)
-        self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter)
-        if self.alpha_mlm > 0.:
-            self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
-        if self.alpha_clm > 0.:
-            self.tensorboard.add_scalar(tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter)
-        if self.alpha_mse > 0.:
-            self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
-        if self.alpha_cos > 0.:
-            self.tensorboard.add_scalar(tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter)
-        self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter)
-        
-        self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter)
-        self.tensorboard.add_scalar(tag="global/speed", scalar_value=time.time()-self.last_log, global_step=self.n_total_iter)
+        self.tensorboard.add_scalar(
+            tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter
+        )
+        if self.alpha_mlm > 0.0:
+            self.tensorboard.add_scalar(
+                tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter
+            )
+        if self.alpha_clm > 0.0:
+            self.tensorboard.add_scalar(
+                tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter
+            )
+        if self.alpha_mse > 0.0:
+            self.tensorboard.add_scalar(
+                tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter
+            )
+        if self.alpha_cos > 0.0:
+            self.tensorboard.add_scalar(
+                tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter
+            )
+        self.tensorboard.add_scalar(
+            tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter
+        )
+
+        self.tensorboard.add_scalar(
+            tag="global/memory_usage",
+            scalar_value=psutil.virtual_memory()._asdict()["used"] / 1_000_000,
+            global_step=self.n_total_iter,
+        )
+        self.tensorboard.add_scalar(
+            tag="global/speed", scalar_value=time.time() - self.last_log, global_step=self.n_total_iter
+        )
 
     def end_epoch(self):
         """
         Finally arrived at the end of epoch (full pass on dataset).
         Do some tensorboard logging and checkpoint saving.
         """
-        logger.info(f'{self.n_sequences_epoch} sequences have been trained during this epoch.')
+        logger.info(f"{self.n_sequences_epoch} sequences have been trained during this epoch.")
 
         if self.is_master:
-            self.save_checkpoint(checkpoint_name=f'model_epoch_{self.epoch}.pth')
-            self.tensorboard.add_scalar(tag='epoch/loss', scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.epoch)
+            self.save_checkpoint(checkpoint_name=f"model_epoch_{self.epoch}.pth")
+            self.tensorboard.add_scalar(
+                tag="epoch/loss", scalar_value=self.total_loss_epoch / self.n_iter, global_step=self.epoch
+            )
 
         self.epoch += 1
         self.n_sequences_epoch = 0
         self.n_iter = 0
         self.total_loss_epoch = 0
 
-    def save_checkpoint(self,
-                        checkpoint_name: str = 'checkpoint.pth'):
+    def save_checkpoint(self, checkpoint_name: str = "checkpoint.pth"):
         """
         Save the current state. Only by the master process.
         """
         if not self.is_master:
             return
-        mdl_to_save = self.student.module if hasattr(self.student, 'module') else self.student
+        mdl_to_save = self.student.module if hasattr(self.student, "module") else self.student
         mdl_to_save.config.save_pretrained(self.dump_path)
         state_dict = mdl_to_save.state_dict()
         torch.save(state_dict, os.path.join(self.dump_path, checkpoint_name))
diff --git a/examples/distillation/grouped_batch_sampler.py b/examples/distillation/grouped_batch_sampler.py
index 46d943a3d4..1132fdb582 100644
--- a/examples/distillation/grouped_batch_sampler.py
+++ b/examples/distillation/grouped_batch_sampler.py
@@ -23,12 +23,14 @@ from torch.utils.data.sampler import BatchSampler, Sampler
 
 from utils import logger
 
+
 def _quantize(x, bins):
     bins = copy.deepcopy(bins)
     bins = sorted(bins)
     quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
     return quantized
 
+
 def create_lengths_groups(lengths, k=0):
     bins = np.arange(start=3, stop=k, step=4).tolist() if k > 0 else [10]
     groups = _quantize(lengths, bins)
@@ -39,6 +41,7 @@ def create_lengths_groups(lengths, k=0):
     logger.info("Count of instances per bin: {}".format(counts))
     return groups
 
+
 class GroupedBatchSampler(BatchSampler):
     """
     Wraps another sampler to yield a mini-batch of indices.
@@ -53,11 +56,11 @@ class GroupedBatchSampler(BatchSampler):
             0, i.e. they must be in the range [0, num_groups).
         batch_size (int): Size of mini-batch.
     """
+
     def __init__(self, sampler, group_ids, batch_size):
         if not isinstance(sampler, Sampler):
             raise ValueError(
-                "sampler should be an instance of "
-                "torch.utils.data.Sampler, but got sampler={}".format(sampler)
+                "sampler should be an instance of " "torch.utils.data.Sampler, but got sampler={}".format(sampler)
             )
         self.sampler = sampler
         self.group_ids = group_ids
@@ -73,7 +76,7 @@ class GroupedBatchSampler(BatchSampler):
             buffer_per_group[group_id].append(idx)
             samples_per_group[group_id].append(idx)
             if len(buffer_per_group[group_id]) == self.batch_size:
-                yield buffer_per_group[group_id] #TODO
+                yield buffer_per_group[group_id]  # TODO
                 num_batches += 1
                 del buffer_per_group[group_id]
             assert len(buffer_per_group[group_id]) < self.batch_size
@@ -90,8 +93,8 @@ class GroupedBatchSampler(BatchSampler):
             for group_id, idxs in sorted(buffer_per_group.items(), key=lambda x: x[0]):
                 batch_idx.extend(idxs)
                 if len(batch_idx) >= self.batch_size:
-                    yield batch_idx[:self.batch_size]
-                    batch_idx = batch_idx[self.batch_size:]
+                    yield batch_idx[: self.batch_size]
+                    batch_idx = batch_idx[self.batch_size :]
                     num_remaining -= 1
             if len(batch_idx) > 0:
                 yield batch_idx
diff --git a/examples/distillation/lm_seqs_dataset.py b/examples/distillation/lm_seqs_dataset.py
index 54e9742ce8..bb0d80f38b 100644
--- a/examples/distillation/lm_seqs_dataset.py
+++ b/examples/distillation/lm_seqs_dataset.py
@@ -21,6 +21,7 @@ from torch.utils.data import Dataset
 import numpy as np
 from utils import logger
 
+
 class LmSeqsDataset(Dataset):
     """Custom Dataset wrapping language modeling sequences.
 
@@ -32,9 +33,7 @@ class LmSeqsDataset(Dataset):
         data: `List[np.array[int]]
     """
 
-    def __init__(self,
-                 params,
-                 data):
+    def __init__(self, params, data):
         self.params = params
 
         self.token_ids = np.array(data)
@@ -57,7 +56,7 @@ class LmSeqsDataset(Dataset):
         Some sanity checks
         """
         assert len(self.token_ids) == len(self.lengths)
-        assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths))) 
+        assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths)))
 
     def remove_long_sequences(self):
         """
@@ -65,17 +64,17 @@ class LmSeqsDataset(Dataset):
         """
         max_len = self.params.max_model_input_size
         indices = self.lengths > max_len
-        logger.info(f'Splitting {sum(indices)} too long sequences.')
+        logger.info(f"Splitting {sum(indices)} too long sequences.")
 
         def divide_chunks(l, n):
-            return [l[i:i + n] for i in range(0, len(l), n)]
+            return [l[i : i + n] for i in range(0, len(l), n)]
 
         new_tok_ids = []
         new_lengths = []
         if self.params.mlm:
-            cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']
+            cls_id, sep_id = self.params.special_tok_ids["cls_token"], self.params.special_tok_ids["sep_token"]
         else:
-            cls_id, sep_id = self.params.special_tok_ids['bos_token'], self.params.special_tok_ids['eos_token']
+            cls_id, sep_id = self.params.special_tok_ids["bos_token"], self.params.special_tok_ids["eos_token"]
 
         for seq_, len_ in zip(self.token_ids, self.lengths):
             assert (seq_[0] == cls_id) and (seq_[-1] == sep_id), seq_
@@ -84,7 +83,7 @@ class LmSeqsDataset(Dataset):
                 new_lengths.append(len_)
             else:
                 sub_seqs = []
-                for sub_s in divide_chunks(seq_, max_len-2):
+                for sub_s in divide_chunks(seq_, max_len - 2):
                     if sub_s[0] != cls_id:
                         sub_s = np.insert(sub_s, 0, cls_id)
                     if sub_s[-1] != sep_id:
@@ -108,7 +107,7 @@ class LmSeqsDataset(Dataset):
         self.token_ids = self.token_ids[indices]
         self.lengths = self.lengths[indices]
         new_size = len(self)
-        logger.info(f'Remove {init_size - new_size} too short (<=11 tokens) sequences.')
+        logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
 
     def print_statistics(self):
         """
@@ -116,7 +115,7 @@ class LmSeqsDataset(Dataset):
         """
         if not self.params.is_master:
             return
-        logger.info(f'{len(self)} sequences')
+        logger.info(f"{len(self)} sequences")
         # data_len = sum(self.lengths)
         # nb_unique_tokens = len(Counter(list(chain(*self.token_ids))))
         # logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)')
@@ -125,8 +124,7 @@ class LmSeqsDataset(Dataset):
         # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
         # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
 
-    def batch_sequences(self,
-                        batch):
+    def batch_sequences(self, batch):
         """
         Do the padding and transform into torch.tensor.
         """
@@ -139,13 +137,13 @@ class LmSeqsDataset(Dataset):
 
         # Pad token ids
         if self.params.mlm:
-            pad_idx = self.params.special_tok_ids['pad_token']
+            pad_idx = self.params.special_tok_ids["pad_token"]
         else:
-            pad_idx = self.params.special_tok_ids['unk_token']
-        tk_ = [list(t.astype(int)) + [pad_idx]*(max_seq_len_-len(t)) for t in token_ids]
+            pad_idx = self.params.special_tok_ids["unk_token"]
+        tk_ = [list(t.astype(int)) + [pad_idx] * (max_seq_len_ - len(t)) for t in token_ids]
         assert len(tk_) == len(token_ids)
         assert all(len(t) == max_seq_len_ for t in tk_)
 
-        tk_t = torch.tensor(tk_)      # (bs, max_seq_len_)
+        tk_t = torch.tensor(tk_)  # (bs, max_seq_len_)
         lg_t = torch.tensor(lengths)  # (bs)
         return tk_t, lg_t
diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py
index 70b65dc1b8..0d5a004eb3 100644
--- a/examples/distillation/run_squad_w_distillation.py
+++ b/examples/distillation/run_squad_w_distillation.py
@@ -25,8 +25,7 @@ import glob
 
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 import torch.nn.functional as F
 import torch.nn as nn
@@ -38,19 +37,32 @@ except:
 
 from tqdm import tqdm, trange
 
-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForQuestionAnswering, BertTokenizer,
-                                  XLMConfig, XLMForQuestionAnswering,
-                                  XLMTokenizer, XLNetConfig,
-                                  XLNetForQuestionAnswering,
-                                  XLNetTokenizer,
-                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertForQuestionAnswering,
+    BertTokenizer,
+    XLMConfig,
+    XLMForQuestionAnswering,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetForQuestionAnswering,
+    XLNetTokenizer,
+    DistilBertConfig,
+    DistilBertForQuestionAnswering,
+    DistilBertTokenizer,
+)
 
 from transformers import AdamW, get_linear_schedule_with_warmup
 
-from ..utils_squad import (read_squad_examples, convert_examples_to_features,
-                         RawResult, write_predictions,
-                         RawResultExtended, write_predictions_extended)
+from ..utils_squad import (
+    read_squad_examples,
+    convert_examples_to_features,
+    RawResult,
+    write_predictions,
+    RawResultExtended,
+    write_predictions_extended,
+)
 
 # The follwing import is the official SQuAD evaluation script (2.0).
 # You can remove it from the dependencies if you are using this script outside of the library
@@ -59,16 +71,18 @@ from ..utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
 
 logger = logging.getLogger(__name__)
 
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
-                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
+ALL_MODELS = sum(
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ()
+)
 
 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
-    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
-    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+    "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer),
+    "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
+    "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
 }
 
+
 def set_seed(args):
     random.seed(args.seed)
     np.random.seed(args.seed)
@@ -76,9 +90,11 @@ def set_seed(args):
     if args.n_gpu > 0:
         torch.cuda.manual_seed_all(args.seed)
 
+
 def to_list(tensor):
     return tensor.detach().cpu().tolist()
 
+
 def train(args, train_dataset, model, tokenizer, teacher=None):
     """ Train the model """
     if args.local_rank in [-1, 0]:
@@ -95,13 +111,18 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
         t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
     # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
     if args.fp16:
         try:
             from apex import amp
@@ -115,17 +136,21 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
 
     # Train!
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
     logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
     logger.info("  Total optimization steps = %d", t_total)
 
@@ -141,40 +166,47 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
             if teacher is not None:
                 teacher.eval()
             batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':       batch[0],
-                      'attention_mask':  batch[1], 
-                      'start_positions': batch[3], 
-                      'end_positions':   batch[4]}
-            if args.model_type != 'distilbert':
-                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
-            if args.model_type in ['xlnet', 'xlm']:
-                inputs.update({'cls_index': batch[5],
-                               'p_mask':       batch[6]})
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                "start_positions": batch[3],
+                "end_positions": batch[4],
+            }
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]
+            if args.model_type in ["xlnet", "xlm"]:
+                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
             outputs = model(**inputs)
             loss, start_logits_stu, end_logits_stu = outputs
 
             # Distillation loss
             if teacher is not None:
-                if 'token_type_ids' not in inputs:
-                    inputs['token_type_ids'] = None if args.teacher_type == 'xlm' else batch[2]
+                if "token_type_ids" not in inputs:
+                    inputs["token_type_ids"] = None if args.teacher_type == "xlm" else batch[2]
                 with torch.no_grad():
-                    start_logits_tea, end_logits_tea = teacher(input_ids=inputs['input_ids'],
-                                                               token_type_ids=inputs['token_type_ids'],
-                                                               attention_mask=inputs['attention_mask'])
+                    start_logits_tea, end_logits_tea = teacher(
+                        input_ids=inputs["input_ids"],
+                        token_type_ids=inputs["token_type_ids"],
+                        attention_mask=inputs["attention_mask"],
+                    )
                 assert start_logits_tea.size() == start_logits_stu.size()
                 assert end_logits_tea.size() == end_logits_stu.size()
-                
-                loss_fct = nn.KLDivLoss(reduction='batchmean')
-                loss_start = loss_fct(F.log_softmax(start_logits_stu/args.temperature, dim=-1),
-                                      F.softmax(start_logits_tea/args.temperature, dim=-1)) * (args.temperature**2)
-                loss_end = loss_fct(F.log_softmax(end_logits_stu/args.temperature, dim=-1),
-                                    F.softmax(end_logits_tea/args.temperature, dim=-1)) * (args.temperature**2)
-                loss_ce = (loss_start + loss_end)/2.
 
-                loss = args.alpha_ce*loss_ce + args.alpha_squad*loss
+                loss_fct = nn.KLDivLoss(reduction="batchmean")
+                loss_start = loss_fct(
+                    F.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                    F.softmax(start_logits_tea / args.temperature, dim=-1),
+                ) * (args.temperature ** 2)
+                loss_end = loss_fct(
+                    F.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                    F.softmax(end_logits_tea / args.temperature, dim=-1),
+                ) * (args.temperature ** 2)
+                loss_ce = (loss_start + loss_end) / 2.0
+
+                loss = args.alpha_ce * loss_ce + args.alpha_squad * loss
 
             if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
             if args.gradient_accumulation_steps > 1:
                 loss = loss / args.gradient_accumulation_steps
 
@@ -195,22 +227,26 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
 
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                         results = evaluate(args, model, tokenizer)
                         for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                     logging_loss = tr_loss
 
                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                     # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                     model_to_save.save_pretrained(output_dir)
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
@@ -246,32 +282,31 @@ def evaluate(args, model, tokenizer, prefix=""):
         model.eval()
         batch = tuple(t.to(args.device) for t in batch)
         with torch.no_grad():
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1]
-                      }
-            if args.model_type != 'distilbert':
-                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]  # XLM don't use segment_ids
             example_indices = batch[3]
-            if args.model_type in ['xlnet', 'xlm']:
-                inputs.update({'cls_index': batch[4],
-                               'p_mask':    batch[5]})
+            if args.model_type in ["xlnet", "xlm"]:
+                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
             outputs = model(**inputs)
 
         for i, example_index in enumerate(example_indices):
             eval_feature = features[example_index.item()]
             unique_id = int(eval_feature.unique_id)
-            if args.model_type in ['xlnet', 'xlm']:
+            if args.model_type in ["xlnet", "xlm"]:
                 # XLNet uses a more complex post-processing procedure
-                result = RawResultExtended(unique_id            = unique_id,
-                                           start_top_log_probs  = to_list(outputs[0][i]),
-                                           start_top_index      = to_list(outputs[1][i]),
-                                           end_top_log_probs    = to_list(outputs[2][i]),
-                                           end_top_index        = to_list(outputs[3][i]),
-                                           cls_logits           = to_list(outputs[4][i]))
+                result = RawResultExtended(
+                    unique_id=unique_id,
+                    start_top_log_probs=to_list(outputs[0][i]),
+                    start_top_index=to_list(outputs[1][i]),
+                    end_top_log_probs=to_list(outputs[2][i]),
+                    end_top_index=to_list(outputs[3][i]),
+                    cls_logits=to_list(outputs[4][i]),
+                )
             else:
-                result = RawResult(unique_id    = unique_id,
-                                   start_logits = to_list(outputs[0][i]),
-                                   end_logits   = to_list(outputs[1][i]))
+                result = RawResult(
+                    unique_id=unique_id, start_logits=to_list(outputs[0][i]), end_logits=to_list(outputs[1][i])
+                )
             all_results.append(result)
 
     # Compute predictions
@@ -282,23 +317,44 @@ def evaluate(args, model, tokenizer, prefix=""):
     else:
         output_null_log_odds_file = None
 
-    if args.model_type in ['xlnet', 'xlm']:
+    if args.model_type in ["xlnet", "xlm"]:
         # XLNet uses a more complex post-processing procedure
-        write_predictions_extended(examples, features, all_results, args.n_best_size,
-                        args.max_answer_length, output_prediction_file,
-                        output_nbest_file, output_null_log_odds_file, args.predict_file,
-                        model.config.start_n_top, model.config.end_n_top,
-                        args.version_2_with_negative, tokenizer, args.verbose_logging)
+        write_predictions_extended(
+            examples,
+            features,
+            all_results,
+            args.n_best_size,
+            args.max_answer_length,
+            output_prediction_file,
+            output_nbest_file,
+            output_null_log_odds_file,
+            args.predict_file,
+            model.config.start_n_top,
+            model.config.end_n_top,
+            args.version_2_with_negative,
+            tokenizer,
+            args.verbose_logging,
+        )
     else:
-        write_predictions(examples, features, all_results, args.n_best_size,
-                        args.max_answer_length, args.do_lower_case, output_prediction_file,
-                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
-                        args.version_2_with_negative, args.null_score_diff_threshold)
+        write_predictions(
+            examples,
+            features,
+            all_results,
+            args.n_best_size,
+            args.max_answer_length,
+            args.do_lower_case,
+            output_prediction_file,
+            output_nbest_file,
+            output_null_log_odds_file,
+            args.verbose_logging,
+            args.version_2_with_negative,
+            args.null_score_diff_threshold,
+        )
 
     # Evaluate with the official SQuAD script
-    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
-                                 pred_file=output_prediction_file,
-                                 na_prob_file=output_null_log_odds_file)
+    evaluate_options = EVAL_OPTS(
+        data_file=args.predict_file, pred_file=output_prediction_file, na_prob_file=output_null_log_odds_file
+    )
     results = evaluate_on_squad(evaluate_options)
     return results
 
@@ -309,24 +365,30 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
 
     # Load data features from cache or dataset file
     input_file = args.predict_file if evaluate else args.train_file
-    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
-        'dev' if evaluate else 'train',
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length)))
+    cached_features_file = os.path.join(
+        os.path.dirname(input_file),
+        "cached_{}_{}_{}".format(
+            "dev" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+        ),
+    )
     if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
         logger.info("Creating features from dataset file at %s", input_file)
-        examples = read_squad_examples(input_file=input_file,
-                                                is_training=not evaluate,
-                                                version_2_with_negative=args.version_2_with_negative)
-        features = convert_examples_to_features(examples=examples,
-                                                tokenizer=tokenizer,
-                                                max_seq_length=args.max_seq_length,
-                                                doc_stride=args.doc_stride,
-                                                max_query_length=args.max_query_length,
-                                                is_training=not evaluate)
+        examples = read_squad_examples(
+            input_file=input_file, is_training=not evaluate, version_2_with_negative=args.version_2_with_negative
+        )
+        features = convert_examples_to_features(
+            examples=examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=not evaluate,
+        )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
@@ -342,14 +404,21 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
     all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
     if evaluate:
         all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_example_index, all_cls_index, all_p_mask)
+        dataset = TensorDataset(
+            all_input_ids, all_input_mask, all_segment_ids, all_example_index, all_cls_index, all_p_mask
+        )
     else:
         all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
         all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_start_positions, all_end_positions,
-                                all_cls_index, all_p_mask)
+        dataset = TensorDataset(
+            all_input_ids,
+            all_input_mask,
+            all_segment_ids,
+            all_start_positions,
+            all_end_positions,
+            all_cls_index,
+            all_p_mask,
+        )
 
     if output_examples:
         return dataset, examples, features
@@ -360,121 +429,213 @@ def main():
     parser = argparse.ArgumentParser()
 
     ## Required parameters
-    parser.add_argument("--train_file", default=None, type=str, required=True,
-                        help="SQuAD json for training. E.g., train-v1.1.json")
-    parser.add_argument("--predict_file", default=None, type=str, required=True,
-                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model checkpoints and predictions will be written.")
+    parser.add_argument(
+        "--train_file", default=None, type=str, required=True, help="SQuAD json for training. E.g., train-v1.1.json"
+    )
+    parser.add_argument(
+        "--predict_file",
+        default=None,
+        type=str,
+        required=True,
+        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model checkpoints and predictions will be written.",
+    )
 
     # Distillation parameters (optional)
-    parser.add_argument('--teacher_type', default=None, type=str,
-                        help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.")
-    parser.add_argument('--teacher_name_or_path', default=None, type=str,
-                        help="Path to the already SQuAD fine-tuned teacher model. Only for distillation.")
-    parser.add_argument('--alpha_ce', default=0.5, type=float,
-                        help="Distillation loss linear weight. Only for distillation.")
-    parser.add_argument('--alpha_squad', default=0.5, type=float,
-                        help="True SQuAD loss linear weight. Only for distillation.")
-    parser.add_argument('--temperature', default=2.0, type=float,
-                        help="Distillation temperature. Only for distillation.")
+    parser.add_argument(
+        "--teacher_type",
+        default=None,
+        type=str,
+        help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.",
+    )
+    parser.add_argument(
+        "--teacher_name_or_path",
+        default=None,
+        type=str,
+        help="Path to the already SQuAD fine-tuned teacher model. Only for distillation.",
+    )
+    parser.add_argument(
+        "--alpha_ce", default=0.5, type=float, help="Distillation loss linear weight. Only for distillation."
+    )
+    parser.add_argument(
+        "--alpha_squad", default=0.5, type=float, help="True SQuAD loss linear weight. Only for distillation."
+    )
+    parser.add_argument(
+        "--temperature", default=2.0, type=float, help="Distillation temperature. Only for distillation."
+    )
 
     ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
 
-    parser.add_argument('--version_2_with_negative', action='store_true',
-                        help='If true, the SQuAD examples contain some that do not have an answer.')
-    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
-                        help="If null_score - best_non_null is greater than the threshold predict null.")
+    parser.add_argument(
+        "--version_2_with_negative",
+        action="store_true",
+        help="If true, the SQuAD examples contain some that do not have an answer.",
+    )
+    parser.add_argument(
+        "--null_score_diff_threshold",
+        type=float,
+        default=0.0,
+        help="If null_score - best_non_null is greater than the threshold predict null.",
+    )
 
-    parser.add_argument("--max_seq_length", default=384, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
-                             "longer than this will be truncated, and sequences shorter than this will be padded.")
-    parser.add_argument("--doc_stride", default=128, type=int,
-                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
-    parser.add_argument("--max_query_length", default=64, type=int,
-                        help="The maximum number of tokens for the question. Questions longer than this will "
-                             "be truncated to this length.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--max_seq_length",
+        default=384,
+        type=int,
+        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
+        "longer than this will be truncated, and sequences shorter than this will be padded.",
+    )
+    parser.add_argument(
+        "--doc_stride",
+        default=128,
+        type=int,
+        help="When splitting up a long document into chunks, how much stride to take between chunks.",
+    )
+    parser.add_argument(
+        "--max_query_length",
+        default=64,
+        type=int,
+        help="The maximum number of tokens for the question. Questions longer than this will "
+        "be truncated to this length.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
 
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
-    parser.add_argument("--n_best_size", default=20, type=int,
-                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
-    parser.add_argument("--max_answer_length", default=30, type=int,
-                        help="The maximum length of an answer that can be generated. This is needed because the start "
-                             "and end predictions are not conditioned on one another.")
-    parser.add_argument("--verbose_logging", action='store_true',
-                        help="If true, all of the warnings related to data processing will be printed. "
-                             "A number of warnings are expected for a normal SQuAD evaluation.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+    parser.add_argument(
+        "--n_best_size",
+        default=20,
+        type=int,
+        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
+    )
+    parser.add_argument(
+        "--max_answer_length",
+        default=30,
+        type=int,
+        help="The maximum length of an answer that can be generated. This is needed because the start "
+        "and end predictions are not conditioned on one another.",
+    )
+    parser.add_argument(
+        "--verbose_logging",
+        action="store_true",
+        help="If true, all of the warnings related to data processing will be printed. "
+        "A number of warnings are expected for a normal SQuAD evaluation.",
+    )
 
-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
 
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
     args = parser.parse_args()
 
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
 
     # Setup distant debugging if needed
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
+
         print("Waiting for debugger attach")
         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()
@@ -486,16 +647,24 @@ def main():
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
         args.n_gpu = 1
     args.device = device
 
     # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
 
     # Set seed
     set_seed(args)
@@ -506,27 +675,34 @@ def main():
 
     args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
 
     if args.teacher_type is not None:
         assert args.teacher_name_or_path is not None
-        assert args.alpha_ce > 0.
-        assert args.alpha_ce + args.alpha_squad > 0.
-        assert args.teacher_type != 'distilbert', "We constraint teachers not to be of type DistilBERT."
+        assert args.alpha_ce > 0.0
+        assert args.alpha_ce + args.alpha_squad > 0.0
+        assert args.teacher_type != "distilbert", "We constraint teachers not to be of type DistilBERT."
         teacher_config_class, teacher_model_class, _ = MODEL_CLASSES[args.teacher_type]
-        teacher_config = teacher_config_class.from_pretrained(args.teacher_name_or_path,
-                                                              cache_dir=args.cache_dir if args.cache_dir else None)
-        teacher = teacher_model_class.from_pretrained(args.teacher_name_or_path,
-                                                      config=teacher_config,
-                                                      cache_dir=args.cache_dir if args.cache_dir else None)
+        teacher_config = teacher_config_class.from_pretrained(
+            args.teacher_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None
+        )
+        teacher = teacher_model_class.from_pretrained(
+            args.teacher_name_or_path, config=teacher_config, cache_dir=args.cache_dir if args.cache_dir else None
+        )
         teacher.to(args.device)
     else:
         teacher = None
@@ -544,7 +720,6 @@ def main():
         global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher=teacher)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
-
     # Save the trained model and the tokenizer
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Create output directory if needed
@@ -554,41 +729,44 @@ def main():
         logger.info("Saving model checkpoint to %s", args.output_dir)
         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
         model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = model_class.from_pretrained(args.output_dir, cache_dir=args.cache_dir if args.cache_dir else None)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir,
-                                                    do_lower_case=args.do_lower_case,
-                                                    cache_dir=args.cache_dir if args.cache_dir else None)
+        tokenizer = tokenizer_class.from_pretrained(
+            args.output_dir, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None
+        )
         model.to(args.device)
 
-
     # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
             logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
 
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
 
         for checkpoint in checkpoints:
             # Reload the model
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
             model = model_class.from_pretrained(checkpoint, cache_dir=args.cache_dir if args.cache_dir else None)
             model.to(args.device)
 
             # Evaluate
             result = evaluate(args, model, tokenizer, prefix=global_step)
 
-            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
             results.update(result)
 
     logger.info("Results: {}".format(results))
diff --git a/examples/distillation/scripts/binarized_data.py b/examples/distillation/scripts/binarized_data.py
index 681cc2de34..40bde7d152 100644
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -23,68 +23,65 @@ import numpy as np
 from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer
 import logging
 
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
 logger = logging.getLogger(__name__)
 
+
 def main():
-    parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
-    parser.add_argument('--file_path', type=str, default='data/dump.txt',
-                        help='The path to the data.')
-    parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2'])
-    parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
-                        help="The tokenizer to use.")
-    parser.add_argument('--dump_file', type=str, default='data/dump',
-                        help='The dump file prefix.')
+    parser = argparse.ArgumentParser(
+        description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids)."
+    )
+    parser.add_argument("--file_path", type=str, default="data/dump.txt", help="The path to the data.")
+    parser.add_argument("--tokenizer_type", type=str, default="bert", choices=["bert", "roberta", "gpt2"])
+    parser.add_argument("--tokenizer_name", type=str, default="bert-base-uncased", help="The tokenizer to use.")
+    parser.add_argument("--dump_file", type=str, default="data/dump", help="The dump file prefix.")
     args = parser.parse_args()
 
-
-    logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
-    if args.tokenizer_type == 'bert':
+    logger.info(f"Loading Tokenizer ({args.tokenizer_name})")
+    if args.tokenizer_type == "bert":
         tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
-        bos = tokenizer.special_tokens_map['cls_token'] # `[CLS]`
-        sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]`
-    elif args.tokenizer_type == 'roberta':
+        bos = tokenizer.special_tokens_map["cls_token"]  # `[CLS]`
+        sep = tokenizer.special_tokens_map["sep_token"]  # `[SEP]`
+    elif args.tokenizer_type == "roberta":
         tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
-        bos = tokenizer.special_tokens_map['cls_token'] # `<s>`
-        sep = tokenizer.special_tokens_map['sep_token'] # `</s>`
-    elif args.tokenizer_type == 'gpt2':
+        bos = tokenizer.special_tokens_map["cls_token"]  # `<s>`
+        sep = tokenizer.special_tokens_map["sep_token"]  # `</s>`
+    elif args.tokenizer_type == "gpt2":
         tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
-        bos = tokenizer.special_tokens_map['bos_token'] # `<|endoftext|>`
-        sep = tokenizer.special_tokens_map['eos_token'] # `<|endoftext|>`    
+        bos = tokenizer.special_tokens_map["bos_token"]  # `<|endoftext|>`
+        sep = tokenizer.special_tokens_map["eos_token"]  # `<|endoftext|>`
 
-    logger.info(f'Loading text from {args.file_path}')
-    with open(args.file_path, 'r', encoding='utf8') as fp:
+    logger.info(f"Loading text from {args.file_path}")
+    with open(args.file_path, "r", encoding="utf8") as fp:
         data = fp.readlines()
 
-
-    logger.info(f'Start encoding')
-    logger.info(f'{len(data)} examples to process.')
+    logger.info(f"Start encoding")
+    logger.info(f"{len(data)} examples to process.")
 
     rslt = []
     iter = 0
     interval = 10000
     start = time.time()
     for text in data:
-        text = f'{bos} {text.strip()} {sep}'
+        text = f"{bos} {text.strip()} {sep}"
         token_ids = tokenizer.encode(text, add_special_tokens=False)
         rslt.append(token_ids)
 
         iter += 1
         if iter % interval == 0:
             end = time.time()
-            logger.info(f'{iter} examples processed. - {(end-start)/interval:.2f}s/expl')
+            logger.info(f"{iter} examples processed. - {(end-start)/interval:.2f}s/expl")
             start = time.time()
-    logger.info('Finished binarization')
-    logger.info(f'{len(data)} examples processed.')
+    logger.info("Finished binarization")
+    logger.info(f"{len(data)} examples processed.")
 
-
-    dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle'
+    dp_file = f"{args.dump_file}.{args.tokenizer_name}.pickle"
     rslt_ = [np.uint16(d) for d in rslt]
     random.shuffle(rslt_)
-    logger.info(f'Dump to {dp_file}')
-    with open(dp_file, 'wb') as handle:
+    logger.info(f"Dump to {dp_file}")
+    with open(dp_file, "wb") as handle:
         pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)
 
 
diff --git a/examples/distillation/scripts/extract.py b/examples/distillation/scripts/extract.py
index 5ae1607f3f..9610f8f17a 100644
--- a/examples/distillation/scripts/extract.py
+++ b/examples/distillation/scripts/extract.py
@@ -20,70 +20,80 @@ from transformers import BertForMaskedLM, RobertaForMaskedLM, GPT2LMHeadModel
 import torch
 import argparse
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation"
+    )
     parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"])
-    parser.add_argument("--model_name", default='roberta-large', type=str)
-    parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_roberta_048131723.pth', type=str)
-    parser.add_argument("--vocab_transform", action='store_true')
+    parser.add_argument("--model_name", default="roberta-large", type=str)
+    parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_roberta_048131723.pth", type=str)
+    parser.add_argument("--vocab_transform", action="store_true")
     args = parser.parse_args()
 
-
-    if args.model_type == 'roberta':
+    if args.model_type == "roberta":
         model = RobertaForMaskedLM.from_pretrained(args.model_name)
-        prefix = 'roberta'
-    elif args.model_type == 'gpt2':
+        prefix = "roberta"
+    elif args.model_type == "gpt2":
         model = GPT2LMHeadModel.from_pretrained(args.model_name)
-        prefix = 'transformer'
+        prefix = "transformer"
 
     state_dict = model.state_dict()
     compressed_sd = {}
 
     ### Embeddings ###
-    if args.model_type == 'gpt2':
-        for param_name in ['wte.weight', 'wpe.weight']:
-            compressed_sd[f'{prefix}.{param_name}'] = state_dict[f'{prefix}.{param_name}']
+    if args.model_type == "gpt2":
+        for param_name in ["wte.weight", "wpe.weight"]:
+            compressed_sd[f"{prefix}.{param_name}"] = state_dict[f"{prefix}.{param_name}"]
     else:
-        for w in ['word_embeddings', 'position_embeddings', 'token_type_embeddings']:
-            param_name = f'{prefix}.embeddings.{w}.weight'
+        for w in ["word_embeddings", "position_embeddings", "token_type_embeddings"]:
+            param_name = f"{prefix}.embeddings.{w}.weight"
             compressed_sd[param_name] = state_dict[param_name]
-        for w in ['weight', 'bias']:
-            param_name = f'{prefix}.embeddings.LayerNorm.{w}'
+        for w in ["weight", "bias"]:
+            param_name = f"{prefix}.embeddings.LayerNorm.{w}"
             compressed_sd[param_name] = state_dict[param_name]
 
     ### Transformer Blocks ###
     std_idx = 0
     for teacher_idx in [0, 2, 4, 7, 9, 11]:
-        if args.model_type == 'gpt2':
-            for layer in ['ln_1', 'attn.c_attn', 'attn.c_proj', 'ln_2', 'mlp.c_fc', 'mlp.c_proj']:
-                for w in ['weight', 'bias']:
-                    compressed_sd[f'{prefix}.h.{std_idx}.{layer}.{w}'] = \
-                        state_dict[f'{prefix}.h.{teacher_idx}.{layer}.{w}']
-            compressed_sd[f'{prefix}.h.{std_idx}.attn.bias'] = state_dict[f'{prefix}.h.{teacher_idx}.attn.bias']
+        if args.model_type == "gpt2":
+            for layer in ["ln_1", "attn.c_attn", "attn.c_proj", "ln_2", "mlp.c_fc", "mlp.c_proj"]:
+                for w in ["weight", "bias"]:
+                    compressed_sd[f"{prefix}.h.{std_idx}.{layer}.{w}"] = state_dict[
+                        f"{prefix}.h.{teacher_idx}.{layer}.{w}"
+                    ]
+            compressed_sd[f"{prefix}.h.{std_idx}.attn.bias"] = state_dict[f"{prefix}.h.{teacher_idx}.attn.bias"]
         else:
-            for layer in ['attention.self.query', 'attention.self.key', 'attention.self.value',
-                        'attention.output.dense', 'attention.output.LayerNorm',
-                        'intermediate.dense', 'output.dense', 'output.LayerNorm']:
-                for w in ['weight', 'bias']:
-                    compressed_sd[f'{prefix}.encoder.layer.{std_idx}.{layer}.{w}'] = \
-                        state_dict[f'{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}']
+            for layer in [
+                "attention.self.query",
+                "attention.self.key",
+                "attention.self.value",
+                "attention.output.dense",
+                "attention.output.LayerNorm",
+                "intermediate.dense",
+                "output.dense",
+                "output.LayerNorm",
+            ]:
+                for w in ["weight", "bias"]:
+                    compressed_sd[f"{prefix}.encoder.layer.{std_idx}.{layer}.{w}"] = state_dict[
+                        f"{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}"
+                    ]
         std_idx += 1
 
     ### Language Modeling Head ###s
-    if args.model_type == 'roberta':
-        for layer in ['lm_head.decoder.weight', 'lm_head.bias']:
-            compressed_sd[f'{layer}'] = state_dict[f'{layer}']
+    if args.model_type == "roberta":
+        for layer in ["lm_head.decoder.weight", "lm_head.bias"]:
+            compressed_sd[f"{layer}"] = state_dict[f"{layer}"]
         if args.vocab_transform:
-            for w in ['weight', 'bias']:
-                compressed_sd[f'lm_head.dense.{w}'] = state_dict[f'lm_head.dense.{w}']
-                compressed_sd[f'lm_head.layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
-    elif args.model_type == 'gpt2':
-        for w in ['weight', 'bias']:
-            compressed_sd[f'{prefix}.ln_f.{w}'] = state_dict[f'{prefix}.ln_f.{w}']
-        compressed_sd[f'lm_head.weight'] = state_dict[f'lm_head.weight']
+            for w in ["weight", "bias"]:
+                compressed_sd[f"lm_head.dense.{w}"] = state_dict[f"lm_head.dense.{w}"]
+                compressed_sd[f"lm_head.layer_norm.{w}"] = state_dict[f"lm_head.layer_norm.{w}"]
+    elif args.model_type == "gpt2":
+        for w in ["weight", "bias"]:
+            compressed_sd[f"{prefix}.ln_f.{w}"] = state_dict[f"{prefix}.ln_f.{w}"]
+        compressed_sd[f"lm_head.weight"] = state_dict[f"lm_head.weight"]
 
-    print(f'N layers selected for distillation: {std_idx}')
-    print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
+    print(f"N layers selected for distillation: {std_idx}")
+    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
 
-    print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
+    print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
     torch.save(compressed_sd, args.dump_checkpoint)
diff --git a/examples/distillation/scripts/extract_distilbert.py b/examples/distillation/scripts/extract_distilbert.py
index fdb0662ca7..8e58db5552 100644
--- a/examples/distillation/scripts/extract_distilbert.py
+++ b/examples/distillation/scripts/extract_distilbert.py
@@ -20,63 +20,70 @@ from transformers import BertForMaskedLM, RobertaForMaskedLM
 import torch
 import argparse
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation"
+    )
     parser.add_argument("--model_type", default="bert", choices=["bert"])
-    parser.add_argument("--model_name", default='bert-base-uncased', type=str)
-    parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str)
-    parser.add_argument("--vocab_transform", action='store_true')
+    parser.add_argument("--model_name", default="bert-base-uncased", type=str)
+    parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_bert-base-uncased_0247911.pth", type=str)
+    parser.add_argument("--vocab_transform", action="store_true")
     args = parser.parse_args()
 
-
-    if args.model_type == 'bert':
+    if args.model_type == "bert":
         model = BertForMaskedLM.from_pretrained(args.model_name)
-        prefix = 'bert'
+        prefix = "bert"
     else:
         raise ValueError(f'args.model_type should be "bert".')
 
     state_dict = model.state_dict()
     compressed_sd = {}
 
-    for w in ['word_embeddings', 'position_embeddings']:
-        compressed_sd[f'distilbert.embeddings.{w}.weight'] = \
-            state_dict[f'{prefix}.embeddings.{w}.weight']
-    for w in ['weight', 'bias']:
-        compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \
-            state_dict[f'{prefix}.embeddings.LayerNorm.{w}']
+    for w in ["word_embeddings", "position_embeddings"]:
+        compressed_sd[f"distilbert.embeddings.{w}.weight"] = state_dict[f"{prefix}.embeddings.{w}.weight"]
+    for w in ["weight", "bias"]:
+        compressed_sd[f"distilbert.embeddings.LayerNorm.{w}"] = state_dict[f"{prefix}.embeddings.LayerNorm.{w}"]
 
     std_idx = 0
     for teacher_idx in [0, 2, 4, 7, 9, 11]:
-        for w in ['weight', 'bias']:
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}']
+        for w in ["weight", "bias"]:
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}"
+            ]
 
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}']
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}"
+            ]
 
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}"
+            ]
         std_idx += 1
 
-    compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
-    compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
+    compressed_sd[f"vocab_projector.weight"] = state_dict[f"cls.predictions.decoder.weight"]
+    compressed_sd[f"vocab_projector.bias"] = state_dict[f"cls.predictions.bias"]
     if args.vocab_transform:
-        for w in ['weight', 'bias']:
-            compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
-            compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
+        for w in ["weight", "bias"]:
+            compressed_sd[f"vocab_transform.{w}"] = state_dict[f"cls.predictions.transform.dense.{w}"]
+            compressed_sd[f"vocab_layer_norm.{w}"] = state_dict[f"cls.predictions.transform.LayerNorm.{w}"]
 
-    print(f'N layers selected for distillation: {std_idx}')
-    print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
+    print(f"N layers selected for distillation: {std_idx}")
+    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
 
-    print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
+    print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
     torch.save(compressed_sd, args.dump_checkpoint)
diff --git a/examples/distillation/scripts/token_counts.py b/examples/distillation/scripts/token_counts.py
index d9de17da4e..623caad4b1 100644
--- a/examples/distillation/scripts/token_counts.py
+++ b/examples/distillation/scripts/token_counts.py
@@ -20,32 +20,36 @@ import argparse
 import pickle
 import logging
 
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
 logger = logging.getLogger(__name__)
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)")
-    parser.add_argument("--data_file", type=str, default="data/dump.bert-base-uncased.pickle",
-                        help="The binarized dataset.")
-    parser.add_argument("--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle",
-                        help="The dump file.")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)"
+    )
+    parser.add_argument(
+        "--data_file", type=str, default="data/dump.bert-base-uncased.pickle", help="The binarized dataset."
+    )
+    parser.add_argument(
+        "--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", help="The dump file."
+    )
     parser.add_argument("--vocab_size", default=30522, type=int)
     args = parser.parse_args()
 
-    logger.info(f'Loading data from {args.data_file}')
-    with open(args.data_file, 'rb') as fp:
+    logger.info(f"Loading data from {args.data_file}")
+    with open(args.data_file, "rb") as fp:
         data = pickle.load(fp)
 
-    logger.info('Counting occurences for MLM.')
+    logger.info("Counting occurences for MLM.")
     counter = Counter()
     for tk_ids in data:
         counter.update(tk_ids)
-    counts = [0]*args.vocab_size
+    counts = [0] * args.vocab_size
     for k, v in counter.items():
         counts[k] = v
 
-    logger.info(f'Dump to {args.token_counts_dump}')
-    with open(args.token_counts_dump, 'wb') as handle:
+    logger.info(f"Dump to {args.token_counts_dump}")
+    with open(args.token_counts_dump, "wb") as handle:
         pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL)
diff --git a/examples/distillation/train.py b/examples/distillation/train.py
index 311f0580ff..37c49ae7b2 100644
--- a/examples/distillation/train.py
+++ b/examples/distillation/train.py
@@ -35,166 +35,200 @@ from lm_seqs_dataset import LmSeqsDataset
 
 
 MODEL_CLASSES = {
-    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
-    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
-    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
-    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)
+    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
+    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
+    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
+    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
 }
 
+
 def sanity_checks(args):
     """
     A bunch of args sanity checks to perform even starting...
     """
-    assert (args.mlm and args.alpha_mlm > 0.) or (not args.mlm and args.alpha_mlm == 0.)
-    assert (args.alpha_mlm > 0. and args.alpha_clm == 0.) or (args.alpha_mlm == 0. and args.alpha_clm > 0.)
+    assert (args.mlm and args.alpha_mlm > 0.0) or (not args.mlm and args.alpha_mlm == 0.0)
+    assert (args.alpha_mlm > 0.0 and args.alpha_clm == 0.0) or (args.alpha_mlm == 0.0 and args.alpha_clm > 0.0)
     if args.mlm:
         assert os.path.isfile(args.token_counts)
-        assert (args.student_type in ['roberta', 'distilbert']) and (args.teacher_type in ['roberta', 'bert'])
+        assert (args.student_type in ["roberta", "distilbert"]) and (args.teacher_type in ["roberta", "bert"])
     else:
-        assert (args.student_type in ['gpt2']) and (args.teacher_type in ['gpt2'])
+        assert (args.student_type in ["gpt2"]) and (args.teacher_type in ["gpt2"])
 
-    assert args.teacher_type == args.student_type or (args.student_type=='distilbert' and args.teacher_type=='bert')
+    assert args.teacher_type == args.student_type or (
+        args.student_type == "distilbert" and args.teacher_type == "bert"
+    )
     assert os.path.isfile(args.student_config)
     if args.student_pretrained_weights is not None:
         assert os.path.isfile(args.student_pretrained_weights)
 
-    if args.freeze_token_type_embds: assert args.student_type in ['roberta']
+    if args.freeze_token_type_embds:
+        assert args.student_type in ["roberta"]
+
+    assert args.alpha_ce >= 0.0
+    assert args.alpha_mlm >= 0.0
+    assert args.alpha_clm >= 0.0
+    assert args.alpha_mse >= 0.0
+    assert args.alpha_cos >= 0.0
+    assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.0
 
-    assert args.alpha_ce >= 0.
-    assert args.alpha_mlm >= 0.
-    assert args.alpha_clm >= 0.
-    assert args.alpha_mse >= 0.
-    assert args.alpha_cos >= 0.
-    assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.
 
 def freeze_pos_embeddings(student, args):
-    if args.student_type == 'roberta':
+    if args.student_type == "roberta":
         student.roberta.embeddings.position_embeddings.weight.requires_grad = False
-    elif args.student_type == 'gpt2':
+    elif args.student_type == "gpt2":
         student.transformer.wpe.weight.requires_grad = False
 
+
 def freeze_token_type_embeddings(student, args):
-    if args.student_type == 'roberta':
+    if args.student_type == "roberta":
         student.roberta.embeddings.token_type_embeddings.weight.requires_grad = False
 
+
 def main():
     parser = argparse.ArgumentParser(description="Training")
-    parser.add_argument("--force", action='store_true',
-                        help="Overwrite dump_path if it already exists.")
+    parser.add_argument("--force", action="store_true", help="Overwrite dump_path if it already exists.")
 
-    parser.add_argument("--dump_path", type=str, required=True,
-                        help="The output directory (log, checkpoints, parameters, etc.)")
-    parser.add_argument("--data_file", type=str, required=True,
-                        help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.")
+    parser.add_argument(
+        "--dump_path", type=str, required=True, help="The output directory (log, checkpoints, parameters, etc.)"
+    )
+    parser.add_argument(
+        "--data_file",
+        type=str,
+        required=True,
+        help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.",
+    )
 
-    parser.add_argument("--student_type", type=str, choices=["distilbert", "roberta", "gpt2"], required=True,
-                        help="The student type (DistilBERT, RoBERTa).")
-    parser.add_argument("--student_config", type=str, required=True,
-                        help="Path to the student configuration.")
-    parser.add_argument("--student_pretrained_weights", default=None, type=str,
-                        help="Load student initialization checkpoint.")
+    parser.add_argument(
+        "--student_type",
+        type=str,
+        choices=["distilbert", "roberta", "gpt2"],
+        required=True,
+        help="The student type (DistilBERT, RoBERTa).",
+    )
+    parser.add_argument("--student_config", type=str, required=True, help="Path to the student configuration.")
+    parser.add_argument(
+        "--student_pretrained_weights", default=None, type=str, help="Load student initialization checkpoint."
+    )
 
-    parser.add_argument("--teacher_type", choices=["bert", "roberta", "gpt2"], required=True,
-                        help="Teacher type (BERT, RoBERTa).")
-    parser.add_argument("--teacher_name", type=str, required=True,
-                        help="The teacher model.")
+    parser.add_argument(
+        "--teacher_type", choices=["bert", "roberta", "gpt2"], required=True, help="Teacher type (BERT, RoBERTa)."
+    )
+    parser.add_argument("--teacher_name", type=str, required=True, help="The teacher model.")
 
-    parser.add_argument("--temperature", default=2., type=float,
-                        help="Temperature for the softmax temperature.")
-    parser.add_argument("--alpha_ce", default=0.5, type=float,
-                        help="Linear weight for the distillation loss. Must be >=0.")
-    parser.add_argument("--alpha_mlm", default=0.0, type=float,
-                        help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.")
-    parser.add_argument("--alpha_clm", default=0.5, type=float,
-                        help="Linear weight for the CLM loss. Must be >=0.")
-    parser.add_argument("--alpha_mse", default=0.0, type=float,
-                        help="Linear weight of the MSE loss. Must be >=0.")
-    parser.add_argument("--alpha_cos", default=0.0, type=float,
-                        help="Linear weight of the cosine embedding loss. Must be >=0.")
+    parser.add_argument("--temperature", default=2.0, type=float, help="Temperature for the softmax temperature.")
+    parser.add_argument(
+        "--alpha_ce", default=0.5, type=float, help="Linear weight for the distillation loss. Must be >=0."
+    )
+    parser.add_argument(
+        "--alpha_mlm",
+        default=0.0,
+        type=float,
+        help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.",
+    )
+    parser.add_argument("--alpha_clm", default=0.5, type=float, help="Linear weight for the CLM loss. Must be >=0.")
+    parser.add_argument("--alpha_mse", default=0.0, type=float, help="Linear weight of the MSE loss. Must be >=0.")
+    parser.add_argument(
+        "--alpha_cos", default=0.0, type=float, help="Linear weight of the cosine embedding loss. Must be >=0."
+    )
 
-    parser.add_argument("--mlm", action="store_true",
-                        help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM.")
-    parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
-                        help="Proportion of tokens for which we need to make a prediction.")
-    parser.add_argument("--word_mask", default=0.8, type=float,
-                        help="Proportion of tokens to mask out.")
-    parser.add_argument("--word_keep", default=0.1, type=float,
-                        help="Proportion of tokens to keep.")
-    parser.add_argument("--word_rand", default=0.1, type=float,
-                        help="Proportion of tokens to randomly replace.")
-    parser.add_argument("--mlm_smoothing", default=0.7, type=float,
-                        help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).")
-    parser.add_argument("--token_counts", type=str,
-                        help="The token counts in the data_file for MLM.")
+    parser.add_argument(
+        "--mlm", action="store_true", help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM."
+    )
+    parser.add_argument(
+        "--mlm_mask_prop",
+        default=0.15,
+        type=float,
+        help="Proportion of tokens for which we need to make a prediction.",
+    )
+    parser.add_argument("--word_mask", default=0.8, type=float, help="Proportion of tokens to mask out.")
+    parser.add_argument("--word_keep", default=0.1, type=float, help="Proportion of tokens to keep.")
+    parser.add_argument("--word_rand", default=0.1, type=float, help="Proportion of tokens to randomly replace.")
+    parser.add_argument(
+        "--mlm_smoothing",
+        default=0.7,
+        type=float,
+        help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).",
+    )
+    parser.add_argument("--token_counts", type=str, help="The token counts in the data_file for MLM.")
 
-    parser.add_argument("--restrict_ce_to_mask", action='store_true',
-                        help="If true, compute the distilation loss only the [MLM] prediction distribution.")
-    parser.add_argument("--freeze_pos_embs", action="store_true",
-                        help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.")
-    parser.add_argument("--freeze_token_type_embds", action="store_true",
-                        help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.")
+    parser.add_argument(
+        "--restrict_ce_to_mask",
+        action="store_true",
+        help="If true, compute the distilation loss only the [MLM] prediction distribution.",
+    )
+    parser.add_argument(
+        "--freeze_pos_embs",
+        action="store_true",
+        help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.",
+    )
+    parser.add_argument(
+        "--freeze_token_type_embds",
+        action="store_true",
+        help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.",
+    )
 
-    parser.add_argument("--n_epoch", type=int, default=3,
-                        help="Number of pass on the whole dataset.")
-    parser.add_argument("--batch_size", type=int, default=5,
-                        help="Batch size (for each process).")
-    parser.add_argument("--group_by_size", action='store_false',
-                        help="If true, group sequences that have similar length into the same batch. Default is true.")
+    parser.add_argument("--n_epoch", type=int, default=3, help="Number of pass on the whole dataset.")
+    parser.add_argument("--batch_size", type=int, default=5, help="Batch size (for each process).")
+    parser.add_argument(
+        "--group_by_size",
+        action="store_false",
+        help="If true, group sequences that have similar length into the same batch. Default is true.",
+    )
 
-    parser.add_argument("--gradient_accumulation_steps", type=int, default=50,
-                        help="Gradient accumulation for larger training batches.")
-    parser.add_argument("--warmup_prop", default=0.05, type=float,
-                        help="Linear warmup proportion.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--learning_rate", default=5e-4, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=5.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--initializer_range", default=0.02, type=float,
-                        help="Random initialization range.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=50,
+        help="Gradient accumulation for larger training batches.",
+    )
+    parser.add_argument("--warmup_prop", default=0.05, type=float, help="Linear warmup proportion.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--learning_rate", default=5e-4, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=5.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--initializer_range", default=0.02, type=float, help="Random initialization range.")
 
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--n_gpu", type=int, default=1,
-                        help="Number of GPUs in the node.")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="Distributed training - Local rank")
-    parser.add_argument("--seed", type=int, default=56,
-                        help="Random seed")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs in the node.")
+    parser.add_argument("--local_rank", type=int, default=-1, help="Distributed training - Local rank")
+    parser.add_argument("--seed", type=int, default=56, help="Random seed")
 
-    parser.add_argument("--log_interval", type=int, default=500,
-                        help="Tensorboard logging interval.")
-    parser.add_argument("--checkpoint_interval", type=int, default=4000,
-                        help="Checkpoint interval.")
+    parser.add_argument("--log_interval", type=int, default=500, help="Tensorboard logging interval.")
+    parser.add_argument("--checkpoint_interval", type=int, default=4000, help="Checkpoint interval.")
     args = parser.parse_args()
     sanity_checks(args)
 
-
     ## ARGS ##
     init_gpu_params(args)
     set_seed(args)
     if args.is_master:
         if os.path.exists(args.dump_path):
             if not args.force:
-                raise ValueError(f'Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it'
-                                   'Use `--force` if you want to overwrite it')
+                raise ValueError(
+                    f"Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it"
+                    "Use `--force` if you want to overwrite it"
+                )
             else:
                 shutil.rmtree(args.dump_path)
 
         if not os.path.exists(args.dump_path):
             os.makedirs(args.dump_path)
-        logger.info(f'Experiment will be dumped and logged in {args.dump_path}')
-
+        logger.info(f"Experiment will be dumped and logged in {args.dump_path}")
 
         ### SAVE PARAMS ###
-        logger.info(f'Param: {args}')
-        with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f:
+        logger.info(f"Param: {args}")
+        with open(os.path.join(args.dump_path, "parameters.json"), "w") as f:
             json.dump(vars(args), f, indent=4)
         git_log(args.dump_path)
 
@@ -207,58 +241,50 @@ def main():
     for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
         idx = tokenizer.all_special_tokens.index(tok_symbol)
         special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
-    logger.info(f'Special tokens {special_tok_ids}')
+    logger.info(f"Special tokens {special_tok_ids}")
     args.special_tok_ids = special_tok_ids
     args.max_model_input_size = tokenizer.max_model_input_sizes[args.teacher_name]
 
-
     ## DATA LOADER ##
-    logger.info(f'Loading data from {args.data_file}')
-    with open(args.data_file, 'rb') as fp:
+    logger.info(f"Loading data from {args.data_file}")
+    with open(args.data_file, "rb") as fp:
         data = pickle.load(fp)
 
-
     if args.mlm:
-        logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
-        with open(args.token_counts, 'rb') as fp:
+        logger.info(f"Loading token counts from {args.token_counts} (already pre-computed)")
+        with open(args.token_counts, "rb") as fp:
             counts = pickle.load(fp)
-        
+
         token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
         for idx in special_tok_ids.values():
-            token_probs[idx] = 0.  # do not predict special tokens
+            token_probs[idx] = 0.0  # do not predict special tokens
         token_probs = torch.from_numpy(token_probs)
     else:
         token_probs = None
 
-
     train_lm_seq_dataset = LmSeqsDataset(params=args, data=data)
-    logger.info(f'Data loader created.')
-
+    logger.info(f"Data loader created.")
 
     ## STUDENT ##
-    logger.info(f'Loading student config from {args.student_config}')
+    logger.info(f"Loading student config from {args.student_config}")
     stu_architecture_config = student_config_class.from_pretrained(args.student_config)
     stu_architecture_config.output_hidden_states = True
 
     if args.student_pretrained_weights is not None:
-        logger.info(f'Loading pretrained weights from {args.student_pretrained_weights}')
-        student = student_model_class.from_pretrained(args.student_pretrained_weights,
-                                                      config=stu_architecture_config)
+        logger.info(f"Loading pretrained weights from {args.student_pretrained_weights}")
+        student = student_model_class.from_pretrained(args.student_pretrained_weights, config=stu_architecture_config)
     else:
         student = student_model_class(stu_architecture_config)
 
-
     if args.n_gpu > 0:
-        student.to(f'cuda:{args.local_rank}')
-    logger.info(f'Student loaded.')
-
+        student.to(f"cuda:{args.local_rank}")
+    logger.info(f"Student loaded.")
 
     ## TEACHER ##
     teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True)
     if args.n_gpu > 0:
-        teacher.to(f'cuda:{args.local_rank}')
-    logger.info(f'Teacher loaded from {args.teacher_name}.')
-
+        teacher.to(f"cuda:{args.local_rank}")
+    logger.info(f"Teacher loaded from {args.teacher_name}.")
 
     ## FREEZING ##
     if args.freeze_pos_embs:
@@ -266,7 +292,6 @@ def main():
     if args.freeze_token_type_embds:
         freeze_token_type_embeddings(student, args)
 
-
     ## SANITY CHECKS ##
     assert student.config.vocab_size == teacher.config.vocab_size
     assert student.config.hidden_size == teacher.config.hidden_size
@@ -274,14 +299,11 @@ def main():
     if args.mlm:
         assert token_probs.size(0) == stu_architecture_config.vocab_size
 
-
     ## DISTILLER ##
     torch.cuda.empty_cache()
-    distiller = Distiller(params=args,
-                          dataset=train_lm_seq_dataset,
-                          token_probs=token_probs,
-                          student=student,
-                          teacher=teacher)
+    distiller = Distiller(
+        params=args, dataset=train_lm_seq_dataset, token_probs=token_probs, student=student, teacher=teacher
+    )
     distiller.train()
     logger.info("Let's go get some drinks.")
 
diff --git a/examples/distillation/utils.py b/examples/distillation/utils.py
index 3d62504710..f9d7412cb5 100644
--- a/examples/distillation/utils.py
+++ b/examples/distillation/utils.py
@@ -23,9 +23,12 @@ import torch
 import numpy as np
 
 import logging
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d -  %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d -  %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
+)
 logger = logging.getLogger(__name__)
 
 
@@ -35,12 +38,12 @@ def git_log(folder_path: str):
     """
     repo = git.Repo(search_parent_directories=True)
     repo_infos = {
-        'repo_id': str(repo),
-        'repo_sha': str(repo.head.object.hexsha),
-        'repo_branch': str(repo.active_branch)
+        "repo_id": str(repo),
+        "repo_sha": str(repo.head.object.hexsha),
+        "repo_branch": str(repo.active_branch),
     }
 
-    with open(os.path.join(folder_path, 'git_log.json'), 'w') as f:
+    with open(os.path.join(folder_path, "git_log.json"), "w") as f:
         json.dump(repo_infos, f, indent=4)
 
 
@@ -57,21 +60,21 @@ def init_gpu_params(params):
 
     assert torch.cuda.is_available()
 
-    logger.info('Initializing GPUs')
+    logger.info("Initializing GPUs")
     if params.n_gpu > 1:
         assert params.local_rank != -1
 
-        params.world_size = int(os.environ['WORLD_SIZE'])
-        params.n_gpu_per_node = int(os.environ['N_GPU_NODE'])
-        params.global_rank = int(os.environ['RANK'])
+        params.world_size = int(os.environ["WORLD_SIZE"])
+        params.n_gpu_per_node = int(os.environ["N_GPU_NODE"])
+        params.global_rank = int(os.environ["RANK"])
 
         # number of nodes / node ID
         params.n_nodes = params.world_size // params.n_gpu_per_node
         params.node_id = params.global_rank // params.n_gpu_per_node
         params.multi_gpu = True
 
-        assert params.n_nodes == int(os.environ['N_NODES'])
-        assert params.node_id == int(os.environ['NODE_RANK'])
+        assert params.n_nodes == int(os.environ["N_NODES"])
+        assert params.node_id == int(os.environ["NODE_RANK"])
 
     # local job (single GPU)
     else:
@@ -114,8 +117,7 @@ def init_gpu_params(params):
     if params.multi_gpu:
         logger.info("Initializing PyTorch distributed")
         torch.distributed.init_process_group(
-            init_method='env://',
-            backend='nccl',
+            init_method="env://", backend="nccl",
         )
 
 
diff --git a/examples/mm-imdb/run_mmimdb.py b/examples/mm-imdb/run_mmimdb.py
index f4a44bf62a..c92dbd3d36 100644
--- a/examples/mm-imdb/run_mmimdb.py
+++ b/examples/mm-imdb/run_mmimdb.py
@@ -40,29 +40,49 @@ from tqdm import tqdm, trange
 
 from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_mmimdb_labels, get_image_transforms
 
-from transformers import (WEIGHTS_NAME,
-                          BertConfig, BertModel, BertTokenizer,
-                          RobertaConfig, RobertaModel, RobertaTokenizer,
-                          XLMConfig, XLMModel, XLMTokenizer,
-                          XLNetConfig, XLNetModel, XLNetTokenizer,
-                          DistilBertConfig, DistilBertModel, DistilBertTokenizer,
-                          AlbertConfig, AlbertModel, AlbertTokenizer,
-                          MMBTForClassification, MMBTConfig)
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertModel,
+    BertTokenizer,
+    RobertaConfig,
+    RobertaModel,
+    RobertaTokenizer,
+    XLMConfig,
+    XLMModel,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetModel,
+    XLNetTokenizer,
+    DistilBertConfig,
+    DistilBertModel,
+    DistilBertTokenizer,
+    AlbertConfig,
+    AlbertModel,
+    AlbertTokenizer,
+    MMBTForClassification,
+    MMBTConfig,
+)
 
 from transformers import AdamW, get_linear_schedule_with_warmup
 
 logger = logging.getLogger(__name__)
 
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig,
-                                                                                RobertaConfig, DistilBertConfig)), ())
+ALL_MODELS = sum(
+    (
+        tuple(conf.pretrained_config_archive_map.keys())
+        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
+    ),
+    (),
+)
 
 MODEL_CLASSES = {
-    'bert': (BertConfig, BertModel, BertTokenizer),
-    'xlnet': (XLNetConfig, XLNetModel, XLNetTokenizer),
-    'xlm': (XLMConfig, XLMModel, XLMTokenizer),
-    'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
-    'albert': (AlbertConfig, AlbertModel, AlbertTokenizer)
+    "bert": (BertConfig, BertModel, BertTokenizer),
+    "xlnet": (XLNetConfig, XLNetModel, XLNetTokenizer),
+    "xlm": (XLMConfig, XLMModel, XLMTokenizer),
+    "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
+    "albert": (AlbertConfig, AlbertModel, AlbertTokenizer),
 }
 
 
@@ -81,10 +101,13 @@ def train(args, train_dataset, model, tokenizer, criterion):
 
     args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
     train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
-                                  batch_size=args.train_batch_size,
-                                  collate_fn=collate_fn,
-                                  num_workers=args.num_workers)
+    train_dataloader = DataLoader(
+        train_dataset,
+        sampler=train_sampler,
+        batch_size=args.train_batch_size,
+        collate_fn=collate_fn,
+        num_workers=args.num_workers,
+    )
 
     if args.max_steps > 0:
         t_total = args.max_steps
@@ -93,14 +116,19 @@ def train(args, train_dataset, model, tokenizer, criterion):
         t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
     # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
     ]
 
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
     if args.fp16:
         try:
             from apex import amp
@@ -114,17 +142,21 @@ def train(args, train_dataset, model, tokenizer, criterion):
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
 
     # Train!
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
     logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
     logger.info("  Total optimization steps = %d", t_total)
 
@@ -140,17 +172,19 @@ def train(args, train_dataset, model, tokenizer, criterion):
             model.train()
             batch = tuple(t.to(args.device) for t in batch)
             labels = batch[5]
-            inputs = {'input_ids':      batch[0],
-                      'input_modal':    batch[2],
-                      'attention_mask': batch[1],
-                      'modal_start_tokens': batch[3],
-                      'modal_end_tokens': batch[4]}
+            inputs = {
+                "input_ids": batch[0],
+                "input_modal": batch[2],
+                "attention_mask": batch[1],
+                "modal_start_tokens": batch[3],
+                "modal_end_tokens": batch[4],
+            }
             outputs = model(**inputs)
             logits = outputs[0]  # model outputs are always tuple in transformers (see doc)
             loss = criterion(logits, labels)
 
             if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
             if args.gradient_accumulation_steps > 1:
                 loss = loss / args.gradient_accumulation_steps
 
@@ -174,30 +208,34 @@ def train(args, train_dataset, model, tokenizer, criterion):
 
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     logs = {}
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                         results = evaluate(args, model, tokenizer, criterion)
                         for key, value in results.items():
-                            eval_key = 'eval_{}'.format(key)
+                            eval_key = "eval_{}".format(key)
                             logs[eval_key] = value
 
                     loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                     learning_rate_scalar = scheduler.get_lr()[0]
-                    logs['learning_rate'] = learning_rate_scalar
-                    logs['loss'] = loss_scalar
+                    logs["learning_rate"] = learning_rate_scalar
+                    logs["loss"] = loss_scalar
                     logging_loss = tr_loss
 
                     for key, value in logs.items():
                         tb_writer.add_scalar(key, value, global_step)
-                    print(json.dumps({**logs, **{'step': global_step}}))
+                    print(json.dumps({**logs, **{"step": global_step}}))
 
                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                     # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                     torch.save(model_to_save.state_dict(), os.path.join(output_dir, WEIGHTS_NAME))
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
@@ -209,13 +247,13 @@ def train(args, train_dataset, model, tokenizer, criterion):
 
         if args.local_rank == -1:
             results = evaluate(args, model, tokenizer, criterion)
-            if results['micro_f1'] > best_f1:
-                best_f1 = results['micro_f1']
+            if results["micro_f1"] > best_f1:
+                best_f1 = results["micro_f1"]
                 n_no_improve = 0
             else:
                 n_no_improve += 1
 
-            if  n_no_improve > args.patience:
+            if n_no_improve > args.patience:
                 train_iterator.close()
                 break
 
@@ -236,7 +274,9 @@ def evaluate(args, model, tokenizer, criterion, prefix=""):
     args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
     # Note that DistributedSampler samples randomly
     eval_sampler = SequentialSampler(eval_dataset)
-    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn)
+    eval_dataloader = DataLoader(
+        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn
+    )
 
     # multi-gpu eval
     if args.n_gpu > 1:
@@ -257,11 +297,13 @@ def evaluate(args, model, tokenizer, criterion, prefix=""):
         with torch.no_grad():
             batch = tuple(t.to(args.device) for t in batch)
             labels = batch[5]
-            inputs = {'input_ids': batch[0],
-                      'input_modal': batch[2],
-                      'attention_mask': batch[1],
-                      'modal_start_tokens': batch[3],
-                      'modal_end_tokens': batch[4]}
+            inputs = {
+                "input_ids": batch[0],
+                "input_modal": batch[2],
+                "attention_mask": batch[1],
+                "modal_start_tokens": batch[3],
+                "modal_end_tokens": batch[4],
+            }
             outputs = model(**inputs)
             logits = outputs[0]  # model outputs are always tuple in transformers (see doc)
             tmp_eval_loss = criterion(logits, labels)
@@ -278,7 +320,7 @@ def evaluate(args, model, tokenizer, criterion, prefix=""):
     result = {
         "loss": eval_loss,
         "macro_f1": f1_score(out_label_ids, preds, average="macro"),
-        "micro_f1": f1_score(out_label_ids, preds, average="micro")
+        "micro_f1": f1_score(out_label_ids, preds, average="micro"),
     }
 
     output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
@@ -303,94 +345,147 @@ def main():
     parser = argparse.ArgumentParser()
 
     ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .jsonl files for MMIMDB.")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .jsonl files for MMIMDB.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
 
     ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences longer "
-                             "than this will be truncated, sequences shorter will be padded.")
-    parser.add_argument("--num_image_embeds", default=1, type=int,
-                        help="Number of Image Embeddings from the Image Encoder")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument(
+        "--num_image_embeds", default=1, type=int, help="Number of Image Embeddings from the Image Encoder"
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
 
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--patience", default=5, type=int,
-                        help="Patience for Early Stopping.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument("--patience", default=5, type=int, help="Patience for Early Stopping.")
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
 
-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Avoid using CUDA when available")
-    parser.add_argument('--num_workers', type=int, default=8,
-                        help="number of worker threads for dataloading")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument("--num_workers", type=int, default=8, help="number of worker threads for dataloading")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
 
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
     args = parser.parse_args()
 
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
 
     # Setup distant debugging if needed
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
+
         print("Waiting for debugger attach")
         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()
@@ -402,17 +497,25 @@ def main():
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
         args.n_gpu = 1
 
     args.device = device
 
     # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
 
     # Set seed
     set_seed(args)
@@ -426,13 +529,17 @@ def main():
     num_labels = len(labels)
     args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    transformer_config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    transformer = model_class.from_pretrained(args.model_name_or_path,
-                                              config=transformer_config,
-                                              cache_dir=args.cache_dir if args.cache_dir else None)
+    transformer_config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    transformer = model_class.from_pretrained(
+        args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir if args.cache_dir else None
+    )
     img_encoder = ImageEncoder(args)
     config = MMBTConfig(transformer_config, num_labels=num_labels)
     model = MMBTForClassification(config, transformer, img_encoder)
@@ -449,12 +556,13 @@ def main():
         train_dataset = load_examples(args, tokenizer, evaluate=False)
         label_frequences = train_dataset.get_label_frequencies()
         label_frequences = [label_frequences[l] for l in labels]
-        label_weights = (torch.tensor(label_frequences, device=args.device, dtype=torch.float) / len(train_dataset)) ** -1
+        label_weights = (
+            torch.tensor(label_frequences, device=args.device, dtype=torch.float) / len(train_dataset)
+        ) ** -1
         criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights)
         global_step, tr_loss = train(args, train_dataset, model, tokenizer, criterion)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
-
     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Create output directory if needed
@@ -464,12 +572,14 @@ def main():
         logger.info("Saving model checkpoint to %s", args.output_dir)
         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
         torch.save(model_to_save.state_dict(), os.path.join(args.output_dir, WEIGHTS_NAME))
         tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = MMBTForClassification(config, transformer, img_encoder)
@@ -477,24 +587,25 @@ def main():
         tokenizer = tokenizer_class.from_pretrained(args.output_dir)
         model.to(args.device)
 
-
     # Evaluation
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
         tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
             logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
             model = MMBTForClassification(config, transformer, img_encoder)
             model.load_state_dict(torch.load(checkpoint))
             model.to(args.device)
             result = evaluate(args, model, tokenizer, criterion, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
             results.update(result)
 
     return results
diff --git a/examples/mm-imdb/utils_mmimdb.py b/examples/mm-imdb/utils_mmimdb.py
index c59da02642..57cee25f9d 100644
--- a/examples/mm-imdb/utils_mmimdb.py
+++ b/examples/mm-imdb/utils_mmimdb.py
@@ -25,17 +25,7 @@ import torchvision
 import torchvision.transforms as transforms
 from torch.utils.data import Dataset
 
-POOLING_BREAKDOWN = {
-    1: (1, 1),
-    2: (2, 1),
-    3: (3, 1),
-    4: (2, 2),
-    5: (5, 1),
-    6: (3, 2),
-    7: (7, 1),
-    8: (4, 2),
-    9: (3, 3)
-}
+POOLING_BREAKDOWN = {1: (1, 1), 2: (2, 1), 3: (3, 1), 4: (2, 2), 5: (5, 1), 6: (3, 2), 7: (7, 1), 8: (4, 2), 9: (3, 3)}
 
 
 class ImageEncoder(nn.Module):
@@ -54,7 +44,6 @@ class ImageEncoder(nn.Module):
         return out  # BxNx2048
 
 
-
 class JsonlDataset(Dataset):
     def __init__(self, data_path, tokenizer, transforms, labels, max_seq_length):
         self.data = [json.loads(l) for l in open(data_path)]
@@ -72,7 +61,7 @@ class JsonlDataset(Dataset):
     def __getitem__(self, index):
         sentence = torch.LongTensor(self.tokenizer.encode(self.data[index]["text"], add_special_tokens=True))
         start_token, sentence, end_token = sentence[0], sentence[1:-1], sentence[-1]
-        sentence = sentence[:self.max_seq_length]
+        sentence = sentence[: self.max_seq_length]
 
         label = torch.zeros(self.n_classes)
         label[[self.labels.index(tgt) for tgt in self.data[index]["label"]]] = 1
@@ -80,8 +69,13 @@ class JsonlDataset(Dataset):
         image = Image.open(os.path.join(self.data_dir, self.data[index]["img"])).convert("RGB")
         image = self.transforms(image)
 
-        return {"image_start_token": start_token, "image_end_token": end_token,
-                "sentence": sentence, "image": image, "label": label}
+        return {
+            "image_start_token": start_token,
+            "image_end_token": end_token,
+            "sentence": sentence,
+            "image": image,
+            "label": label,
+        }
 
     def get_label_frequencies(self):
         label_freqs = Counter()
@@ -110,10 +104,31 @@ def collate_fn(batch):
 
 
 def get_mmimdb_labels():
-    return ['Crime', 'Drama', 'Thriller', 'Action', 'Comedy', 'Romance',
-            'Documentary', 'Short', 'Mystery', 'History', 'Family', 'Adventure',
-            'Fantasy', 'Sci-Fi', 'Western', 'Horror', 'Sport', 'War', 'Music',
-            'Musical', 'Animation', 'Biography', 'Film-Noir']
+    return [
+        "Crime",
+        "Drama",
+        "Thriller",
+        "Action",
+        "Comedy",
+        "Romance",
+        "Documentary",
+        "Short",
+        "Mystery",
+        "History",
+        "Family",
+        "Adventure",
+        "Fantasy",
+        "Sci-Fi",
+        "Western",
+        "Horror",
+        "Sport",
+        "War",
+        "Music",
+        "Musical",
+        "Animation",
+        "Biography",
+        "Film-Noir",
+    ]
 
 
 def get_image_transforms():
@@ -122,9 +137,6 @@ def get_image_transforms():
             transforms.Resize(256),
             transforms.CenterCrop(224),
             transforms.ToTensor(),
-            transforms.Normalize(
-                mean=[0.46777044, 0.44531429, 0.40661017],
-                std=[0.12221994, 0.12145835, 0.14380469],
-            ),
+            transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],),
         ]
     )
diff --git a/examples/pplm/pplm_classification_head.py b/examples/pplm/pplm_classification_head.py
index 9aae0f17e9..05621c3bf2 100644
--- a/examples/pplm/pplm_classification_head.py
+++ b/examples/pplm/pplm_classification_head.py
@@ -1,5 +1,6 @@
 import torch
 
+
 class ClassificationHead(torch.nn.Module):
     """Classification Head for  transformer encoders"""
 
diff --git a/examples/pplm/run_pplm.py b/examples/pplm/run_pplm.py
index 095dc39a74..37183a5121 100644
--- a/examples/pplm/run_pplm.py
+++ b/examples/pplm/run_pplm.py
@@ -1,19 +1,19 @@
 #! /usr/bin/env python3
 # coding=utf-8
 
-#Copyright (c) 2019 Uber Technologies, Inc.
+# Copyright (c) 2019 Uber Technologies, Inc.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 """
 Example command with bag of words:
@@ -46,13 +46,13 @@ SMALL_CONST = 1e-15
 BIG_CONST = 1e10
 
 BAG_OF_WORDS_ARCHIVE_MAP = {
-    'legal': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt",
-    'military': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt",
-    'politics': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt",
-    'religion': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt",
-    'science': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt",
-    'space': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt",
-    'technology': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/technology.txt",
+    "legal": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt",
+    "military": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt",
+    "politics": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt",
+    "religion": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt",
+    "science": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt",
+    "space": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt",
+    "technology": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/technology.txt",
 }
 
 DISCRIMINATOR_MODELS_PARAMS = {
@@ -75,10 +75,10 @@ DISCRIMINATOR_MODELS_PARAMS = {
 }
 
 
-def to_var(x, requires_grad=False, volatile=False, device='cuda'):
-    if torch.cuda.is_available() and device == 'cuda':
+def to_var(x, requires_grad=False, volatile=False, device="cuda"):
+    if torch.cuda.is_available() and device == "cuda":
         x = x.cuda()
-    elif device != 'cuda':
+    elif device != "cuda":
         x = x.to(device)
     return Variable(x, requires_grad=requires_grad, volatile=volatile)
 
@@ -95,49 +95,39 @@ def top_k_filter(logits, k, probs=False):
         values = torch.topk(logits, k)[0]
         batch_mins = values[:, -1].view(-1, 1).expand_as(logits)
         if probs:
-            return torch.where(logits < batch_mins,
-                               torch.ones_like(logits) * 0.0, logits)
-        return torch.where(logits < batch_mins,
-                           torch.ones_like(logits) * -BIG_CONST,
-                           logits)
+            return torch.where(logits < batch_mins, torch.ones_like(logits) * 0.0, logits)
+        return torch.where(logits < batch_mins, torch.ones_like(logits) * -BIG_CONST, logits)
 
 
 def perturb_past(
-        past,
-        model,
-        last,
-        unpert_past=None,
-        unpert_logits=None,
-        accumulated_hidden=None,
-        grad_norms=None,
-        stepsize=0.01,
-        one_hot_bows_vectors=None,
-        classifier=None,
-        class_label=None,
-        loss_type=0,
-        num_iterations=3,
-        horizon_length=1,
-        window_length=0,
-        decay=False,
-        gamma=1.5,
-        kl_scale=0.01,
-        device='cuda',
+    past,
+    model,
+    last,
+    unpert_past=None,
+    unpert_logits=None,
+    accumulated_hidden=None,
+    grad_norms=None,
+    stepsize=0.01,
+    one_hot_bows_vectors=None,
+    classifier=None,
+    class_label=None,
+    loss_type=0,
+    num_iterations=3,
+    horizon_length=1,
+    window_length=0,
+    decay=False,
+    gamma=1.5,
+    kl_scale=0.01,
+    device="cuda",
 ):
     # Generate inital perturbed past
-    grad_accumulator = [
-        (np.zeros(p.shape).astype("float32"))
-        for p in past
-    ]
+    grad_accumulator = [(np.zeros(p.shape).astype("float32")) for p in past]
 
     if accumulated_hidden is None:
         accumulated_hidden = 0
 
     if decay:
-        decay_mask = torch.arange(
-            0.,
-            1.0 + SMALL_CONST,
-            1.0 / (window_length)
-        )[1:]
+        decay_mask = torch.arange(0.0, 1.0 + SMALL_CONST, 1.0 / (window_length))[1:]
     else:
         decay_mask = 1.0
 
@@ -146,26 +136,17 @@ def perturb_past(
     _, _, _, curr_length, _ = past[0].shape
 
     if curr_length > window_length and window_length > 0:
-        ones_key_val_shape = (
-                tuple(past[0].shape[:-2])
-                + tuple([window_length])
-                + tuple(past[0].shape[-1:])
-        )
+        ones_key_val_shape = tuple(past[0].shape[:-2]) + tuple([window_length]) + tuple(past[0].shape[-1:])
 
         zeros_key_val_shape = (
-                tuple(past[0].shape[:-2])
-                + tuple([curr_length - window_length])
-                + tuple(past[0].shape[-1:])
+            tuple(past[0].shape[:-2]) + tuple([curr_length - window_length]) + tuple(past[0].shape[-1:])
         )
 
         ones_mask = torch.ones(ones_key_val_shape)
         ones_mask = decay_mask * ones_mask.permute(0, 1, 2, 4, 3)
         ones_mask = ones_mask.permute(0, 1, 2, 4, 3)
 
-        window_mask = torch.cat(
-            (ones_mask, torch.zeros(zeros_key_val_shape)),
-            dim=-2
-        ).to(device)
+        window_mask = torch.cat((ones_mask, torch.zeros(zeros_key_val_shape)), dim=-2).to(device)
     else:
         window_mask = torch.ones_like(past[0]).to(device)
 
@@ -175,8 +156,7 @@ def perturb_past(
     for i in range(num_iterations):
         print("Iteration ", i + 1)
         curr_perturbation = [
-            to_var(torch.from_numpy(p_), requires_grad=True, device=device)
-            for p_ in grad_accumulator
+            to_var(torch.from_numpy(p_), requires_grad=True, device=device) for p_ in grad_accumulator
         ]
 
         # Compute hidden using perturbed past
@@ -184,10 +164,7 @@ def perturb_past(
         _, _, _, curr_length, _ = curr_perturbation[0].shape
         all_logits, _, all_hidden = model(last, past=perturbed_past)
         hidden = all_hidden[-1]
-        new_accumulated_hidden = accumulated_hidden + torch.sum(
-            hidden,
-            dim=1
-        ).detach()
+        new_accumulated_hidden = accumulated_hidden + torch.sum(hidden, dim=1).detach()
         # TODO: Check the layer-norm consistency of this with trained discriminator (Sumanth)
         logits = all_logits[:, -1, :]
         probs = F.softmax(logits, dim=-1)
@@ -210,20 +187,13 @@ def perturb_past(
             wte = model.resize_token_embeddings()
             for _ in range(horizon_length):
                 inputs_embeds = torch.matmul(curr_probs, wte.weight.data)
-                _, curr_unpert_past, curr_all_hidden = model(
-                    past=curr_unpert_past,
-                    inputs_embeds=inputs_embeds
-                )
+                _, curr_unpert_past, curr_all_hidden = model(past=curr_unpert_past, inputs_embeds=inputs_embeds)
                 curr_hidden = curr_all_hidden[-1]
-                new_accumulated_hidden = new_accumulated_hidden + torch.sum(
-                    curr_hidden, dim=1)
+                new_accumulated_hidden = new_accumulated_hidden + torch.sum(curr_hidden, dim=1)
 
-            prediction = classifier(new_accumulated_hidden /
-                                    (curr_length + 1 + horizon_length))
+            prediction = classifier(new_accumulated_hidden / (curr_length + 1 + horizon_length))
 
-            label = torch.tensor(prediction.shape[0] * [class_label],
-                                 device=device,
-                                 dtype=torch.long)
+            label = torch.tensor(prediction.shape[0] * [class_label], device=device, dtype=torch.long)
             discrim_loss = ce_loss(prediction, label)
             print(" pplm_discrim_loss:", discrim_loss.data.cpu().numpy())
             loss += discrim_loss
@@ -232,21 +202,15 @@ def perturb_past(
         kl_loss = 0.0
         if kl_scale > 0.0:
             unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
-            unpert_probs = (
-                    unpert_probs + SMALL_CONST *
-                    (unpert_probs <= SMALL_CONST).float().to(device).detach()
-            )
-            correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(
-                device).detach()
+            unpert_probs = unpert_probs + SMALL_CONST * (unpert_probs <= SMALL_CONST).float().to(device).detach()
+            correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(device).detach()
             corrected_probs = probs + correction.detach()
-            kl_loss = kl_scale * (
-                (corrected_probs * (corrected_probs / unpert_probs).log()).sum()
-            )
-            print(' kl_loss', kl_loss.data.cpu().numpy())
+            kl_loss = kl_scale * ((corrected_probs * (corrected_probs / unpert_probs).log()).sum())
+            print(" kl_loss", kl_loss.data.cpu().numpy())
             loss += kl_loss
 
         loss_per_iter.append(loss.data.cpu().numpy())
-        print(' pplm_loss', (loss - kl_loss).data.cpu().numpy())
+        print(" pplm_loss", (loss - kl_loss).data.cpu().numpy())
 
         # compute gradients
         loss.backward()
@@ -259,15 +223,12 @@ def perturb_past(
             ]
         else:
             grad_norms = [
-                (torch.norm(p_.grad * window_mask) + SMALL_CONST)
-                for index, p_ in enumerate(curr_perturbation)
+                (torch.norm(p_.grad * window_mask) + SMALL_CONST) for index, p_ in enumerate(curr_perturbation)
             ]
 
         # normalize gradients
         grad = [
-            -stepsize *
-            (p_.grad * window_mask / grad_norms[
-                index] ** gamma).data.cpu().numpy()
+            -stepsize * (p_.grad * window_mask / grad_norms[index] ** gamma).data.cpu().numpy()
             for index, p_ in enumerate(curr_perturbation)
         ]
 
@@ -285,36 +246,27 @@ def perturb_past(
         past = new_past
 
     # apply the accumulated perturbations to the past
-    grad_accumulator = [
-        to_var(torch.from_numpy(p_), requires_grad=True, device=device)
-        for p_ in grad_accumulator
-    ]
+    grad_accumulator = [to_var(torch.from_numpy(p_), requires_grad=True, device=device) for p_ in grad_accumulator]
     pert_past = list(map(add, past, grad_accumulator))
 
     return pert_past, new_accumulated_hidden, grad_norms, loss_per_iter
 
 
 def get_classifier(
-        name: Optional[str], class_label: Union[str, int],
-        device: str
+    name: Optional[str], class_label: Union[str, int], device: str
 ) -> Tuple[Optional[ClassificationHead], Optional[int]]:
     if name is None:
         return None, None
 
     params = DISCRIMINATOR_MODELS_PARAMS[name]
-    classifier = ClassificationHead(
-        class_size=params['class_size'],
-        embed_size=params['embed_size']
-    ).to(device)
+    classifier = ClassificationHead(class_size=params["class_size"], embed_size=params["embed_size"]).to(device)
     if "url" in params:
         resolved_archive_file = cached_path(params["url"])
     elif "path" in params:
         resolved_archive_file = params["path"]
     else:
-        raise ValueError("Either url or path have to be specified "
-                         "in the discriminator model parameters")
-    classifier.load_state_dict(
-        torch.load(resolved_archive_file, map_location=device))
+        raise ValueError("Either url or path have to be specified " "in the discriminator model parameters")
+    classifier.load_state_dict(torch.load(resolved_archive_file, map_location=device))
     classifier.eval()
 
     if isinstance(class_label, str):
@@ -341,8 +293,7 @@ def get_classifier(
     return classifier, label_id
 
 
-def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> \
-        List[List[List[int]]]:
+def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> List[List[List[int]]]:
     bow_indices = []
     for id_or_path in bag_of_words_ids_or_paths:
         if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
@@ -351,13 +302,11 @@ def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) ->
             filepath = id_or_path
         with open(filepath, "r") as f:
             words = f.read().strip().split("\n")
-        bow_indices.append(
-            [tokenizer.encode(word.strip(), add_prefix_space=True) for word in
-             words])
+        bow_indices.append([tokenizer.encode(word.strip(), add_prefix_space=True) for word in words])
     return bow_indices
 
 
-def build_bows_one_hot_vectors(bow_indices, tokenizer, device='cuda'):
+def build_bows_one_hot_vectors(bow_indices, tokenizer, device="cuda"):
     if bow_indices is None:
         return None
 
@@ -373,39 +322,34 @@ def build_bows_one_hot_vectors(bow_indices, tokenizer, device='cuda'):
 
 
 def full_text_generation(
-        model,
-        tokenizer,
-        context=None,
-        num_samples=1,
-        device="cuda",
-        bag_of_words=None,
-        discrim=None,
-        class_label=None,
-        length=100,
-        stepsize=0.02,
-        temperature=1.0,
-        top_k=10,
-        sample=False,
-        num_iterations=3,
-        grad_length=10000,
-        horizon_length=1,
-        window_length=0,
-        decay=False,
-        gamma=1.5,
-        gm_scale=0.9,
-        kl_scale=0.01,
-        **kwargs
+    model,
+    tokenizer,
+    context=None,
+    num_samples=1,
+    device="cuda",
+    bag_of_words=None,
+    discrim=None,
+    class_label=None,
+    length=100,
+    stepsize=0.02,
+    temperature=1.0,
+    top_k=10,
+    sample=False,
+    num_iterations=3,
+    grad_length=10000,
+    horizon_length=1,
+    window_length=0,
+    decay=False,
+    gamma=1.5,
+    gm_scale=0.9,
+    kl_scale=0.01,
+    **kwargs
 ):
-    classifier, class_id = get_classifier(
-        discrim,
-        class_label,
-        device
-    )
+    classifier, class_id = get_classifier(discrim, class_label, device)
 
     bow_indices = []
     if bag_of_words:
-        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
-                                               tokenizer)
+        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer)
 
     if bag_of_words and classifier:
         print("Both PPLM-BoW and PPLM-Discrim are on. This is not optimized.")
@@ -423,15 +367,9 @@ def full_text_generation(
         raise Exception("Specify either a bag of words or a discriminator")
 
     unpert_gen_tok_text, _, _ = generate_text_pplm(
-        model=model,
-        tokenizer=tokenizer,
-        context=context,
-        device=device,
-        length=length,
-        sample=sample,
-        perturb=False
+        model=model, tokenizer=tokenizer, context=context, device=device, length=length, sample=sample, perturb=False
     )
-    if device == 'cuda':
+    if device == "cuda":
         torch.cuda.empty_cache()
 
     pert_gen_tok_texts = []
@@ -468,36 +406,36 @@ def full_text_generation(
             discrim_losses.append(discrim_loss.data.cpu().numpy())
         losses_in_time.append(loss_in_time)
 
-    if device == 'cuda':
+    if device == "cuda":
         torch.cuda.empty_cache()
 
     return unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
 
 
 def generate_text_pplm(
-        model,
-        tokenizer,
-        context=None,
-        past=None,
-        device="cuda",
-        perturb=True,
-        bow_indices=None,
-        classifier=None,
-        class_label=None,
-        loss_type=0,
-        length=100,
-        stepsize=0.02,
-        temperature=1.0,
-        top_k=10,
-        sample=False,
-        num_iterations=3,
-        grad_length=10000,
-        horizon_length=1,
-        window_length=0,
-        decay=False,
-        gamma=1.5,
-        gm_scale=0.9,
-        kl_scale=0.01,
+    model,
+    tokenizer,
+    context=None,
+    past=None,
+    device="cuda",
+    perturb=True,
+    bow_indices=None,
+    classifier=None,
+    class_label=None,
+    loss_type=0,
+    length=100,
+    stepsize=0.02,
+    temperature=1.0,
+    top_k=10,
+    sample=False,
+    num_iterations=3,
+    grad_length=10000,
+    horizon_length=1,
+    window_length=0,
+    decay=False,
+    gamma=1.5,
+    gm_scale=0.9,
+    kl_scale=0.01,
 ):
     output_so_far = None
     if context:
@@ -507,8 +445,7 @@ def generate_text_pplm(
         output_so_far = context_t
 
     # collect one hot vectors for bags of words
-    one_hot_bows_vectors = build_bows_one_hot_vectors(bow_indices, tokenizer,
-                                                      device)
+    one_hot_bows_vectors = build_bows_one_hot_vectors(bow_indices, tokenizer, device)
 
     grad_norms = None
     last = None
@@ -575,13 +512,9 @@ def generate_text_pplm(
         if classifier is not None:
             ce_loss = torch.nn.CrossEntropyLoss()
             prediction = classifier(torch.mean(unpert_last_hidden, dim=1))
-            label = torch.tensor([class_label], device=device,
-                                 dtype=torch.long)
+            label = torch.tensor([class_label], device=device, dtype=torch.long)
             unpert_discrim_loss = ce_loss(prediction, label)
-            print(
-                "unperturbed discrim loss",
-                unpert_discrim_loss.data.cpu().numpy()
-            )
+            print("unperturbed discrim loss", unpert_discrim_loss.data.cpu().numpy())
         else:
             unpert_discrim_loss = 0
 
@@ -590,10 +523,8 @@ def generate_text_pplm(
 
             unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
 
-            pert_probs = ((pert_probs ** gm_scale) * (
-                    unpert_probs ** (1 - gm_scale)))  # + SMALL_CONST
-            pert_probs = top_k_filter(pert_probs, k=top_k,
-                                      probs=True)  # + SMALL_CONST
+            pert_probs = (pert_probs ** gm_scale) * (unpert_probs ** (1 - gm_scale))  # + SMALL_CONST
+            pert_probs = top_k_filter(pert_probs, k=top_k, probs=True)  # + SMALL_CONST
 
             # rescale
             if torch.sum(pert_probs) <= 1:
@@ -611,10 +542,7 @@ def generate_text_pplm(
             _, last = torch.topk(pert_probs, k=1, dim=-1)
 
         # update context/output_so_far appending the new token
-        output_so_far = (
-            last if output_so_far is None
-            else torch.cat((output_so_far, last), dim=1)
-        )
+        output_so_far = last if output_so_far is None else torch.cat((output_so_far, last), dim=1)
 
         print(tokenizer.decode(output_so_far.tolist()[0]))
 
@@ -623,44 +551,42 @@ def generate_text_pplm(
 
 def set_generic_model_params(discrim_weights, discrim_meta):
     if discrim_weights is None:
-        raise ValueError('When using a generic discriminator, '
-                         'discrim_weights need to be specified')
+        raise ValueError("When using a generic discriminator, " "discrim_weights need to be specified")
     if discrim_meta is None:
-        raise ValueError('When using a generic discriminator, '
-                         'discrim_meta need to be specified')
+        raise ValueError("When using a generic discriminator, " "discrim_meta need to be specified")
 
-    with open(discrim_meta, 'r') as discrim_meta_file:
+    with open(discrim_meta, "r") as discrim_meta_file:
         meta = json.load(discrim_meta_file)
-    meta['path'] = discrim_weights
-    DISCRIMINATOR_MODELS_PARAMS['generic'] = meta
+    meta["path"] = discrim_weights
+    DISCRIMINATOR_MODELS_PARAMS["generic"] = meta
 
 
 def run_pplm_example(
-        pretrained_model="gpt2-medium",
-        cond_text="",
-        uncond=False,
-        num_samples=1,
-        bag_of_words=None,
-        discrim=None,
-        discrim_weights=None,
-        discrim_meta=None,
-        class_label=-1,
-        length=100,
-        stepsize=0.02,
-        temperature=1.0,
-        top_k=10,
-        sample=False,
-        num_iterations=3,
-        grad_length=10000,
-        horizon_length=1,
-        window_length=0,
-        decay=False,
-        gamma=1.5,
-        gm_scale=0.9,
-        kl_scale=0.01,
-        seed=0,
-        no_cuda=False,
-        colorama=False
+    pretrained_model="gpt2-medium",
+    cond_text="",
+    uncond=False,
+    num_samples=1,
+    bag_of_words=None,
+    discrim=None,
+    discrim_weights=None,
+    discrim_meta=None,
+    class_label=-1,
+    length=100,
+    stepsize=0.02,
+    temperature=1.0,
+    top_k=10,
+    sample=False,
+    num_iterations=3,
+    grad_length=10000,
+    horizon_length=1,
+    window_length=0,
+    decay=False,
+    gamma=1.5,
+    gm_scale=0.9,
+    kl_scale=0.01,
+    seed=0,
+    no_cuda=False,
+    colorama=False,
 ):
     # set Random seed
     torch.manual_seed(seed)
@@ -669,21 +595,15 @@ def run_pplm_example(
     # set the device
     device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
 
-    if discrim == 'generic':
+    if discrim == "generic":
         set_generic_model_params(discrim_weights, discrim_meta)
 
     if discrim is not None:
-        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
-            "pretrained_model"
-        ]
-        print("discrim = {}, pretrained_model set "
-              "to discriminator's = {}".format(discrim, pretrained_model))
+        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim]["pretrained_model"]
+        print("discrim = {}, pretrained_model set " "to discriminator's = {}".format(discrim, pretrained_model))
 
     # load pretrained model
-    model = GPT2LMHeadModel.from_pretrained(
-        pretrained_model,
-        output_hidden_states=True
-    )
+    model = GPT2LMHeadModel.from_pretrained(pretrained_model, output_hidden_states=True)
     model.to(device)
     model.eval()
 
@@ -696,9 +616,7 @@ def run_pplm_example(
 
     # figure out conditioning text
     if uncond:
-        tokenized_cond_text = tokenizer.encode(
-            [tokenizer.bos_token]
-        )
+        tokenized_cond_text = tokenizer.encode([tokenizer.bos_token])
     else:
         raw_text = cond_text
         while not raw_text:
@@ -750,8 +668,7 @@ def run_pplm_example(
 
     bow_word_ids = set()
     if bag_of_words and colorama:
-        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
-                                               tokenizer)
+        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer)
         for single_bow_list in bow_indices:
             # filtering all words in the list composed of more than 1 token
             filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
@@ -765,13 +682,11 @@ def run_pplm_example(
             if colorama:
                 import colorama
 
-                pert_gen_text = ''
+                pert_gen_text = ""
                 for word_id in pert_gen_tok_text.tolist()[0]:
                     if word_id in bow_word_ids:
-                        pert_gen_text += '{}{}{}'.format(
-                            colorama.Fore.RED,
-                            tokenizer.decode([word_id]),
-                            colorama.Style.RESET_ALL
+                        pert_gen_text += "{}{}{}".format(
+                            colorama.Fore.RED, tokenizer.decode([word_id]), colorama.Style.RESET_ALL
                         )
                     else:
                         pert_gen_text += tokenizer.decode([word_id])
@@ -785,14 +700,12 @@ def run_pplm_example(
             pass
 
         # keep the prefix, perturbed seq, original seq for each index
-        generated_texts.append(
-            (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text)
-        )
+        generated_texts.append((tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text))
 
     return
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--pretrained_model",
@@ -801,19 +714,10 @@ if __name__ == '__main__':
         default="gpt2-medium",
         help="pretrained model name or path to local checkpoint",
     )
+    parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on")
+    parser.add_argument("--uncond", action="store_true", help="Generate from end-of-text as prefix")
     parser.add_argument(
-        "--cond_text", type=str, default="The lake",
-        help="Prefix texts to condition on"
-    )
-    parser.add_argument(
-        "--uncond", action="store_true",
-        help="Generate from end-of-text as prefix"
-    )
-    parser.add_argument(
-        "--num_samples",
-        type=int,
-        default=1,
-        help="Number of samples to generate from the modified latents",
+        "--num_samples", type=int, default=1, help="Number of samples to generate from the modified latents",
     )
     parser.add_argument(
         "--bag_of_words",
@@ -821,8 +725,8 @@ if __name__ == '__main__':
         type=str,
         default=None,
         help="Bags of words used for PPLM-BoW. "
-             "Either a BOW id (see list in code) or a filepath. "
-             "Multiple BoWs separated by ;",
+        "Either a BOW id (see list in code) or a filepath. "
+        "Multiple BoWs separated by ;",
     )
     parser.add_argument(
         "--discrim",
@@ -832,48 +736,36 @@ if __name__ == '__main__':
         choices=("clickbait", "sentiment", "toxicity", "generic"),
         help="Discriminator to use",
     )
-    parser.add_argument('--discrim_weights', type=str, default=None,
-                        help='Weights for the generic discriminator')
-    parser.add_argument('--discrim_meta', type=str, default=None,
-                        help='Meta information for the generic discriminator')
+    parser.add_argument("--discrim_weights", type=str, default=None, help="Weights for the generic discriminator")
     parser.add_argument(
-        "--class_label",
-        type=int,
-        default=-1,
-        help="Class label used for the discriminator",
+        "--discrim_meta", type=str, default=None, help="Meta information for the generic discriminator"
+    )
+    parser.add_argument(
+        "--class_label", type=int, default=-1, help="Class label used for the discriminator",
     )
     parser.add_argument("--length", type=int, default=100)
     parser.add_argument("--stepsize", type=float, default=0.02)
     parser.add_argument("--temperature", type=float, default=1.0)
     parser.add_argument("--top_k", type=int, default=10)
-    parser.add_argument(
-        "--sample", action="store_true",
-        help="Generate from end-of-text as prefix"
-    )
+    parser.add_argument("--sample", action="store_true", help="Generate from end-of-text as prefix")
     parser.add_argument("--num_iterations", type=int, default=3)
     parser.add_argument("--grad_length", type=int, default=10000)
     parser.add_argument(
         "--window_length",
         type=int,
         default=0,
-        help="Length of past which is being optimized; "
-             "0 corresponds to infinite window length",
+        help="Length of past which is being optimized; " "0 corresponds to infinite window length",
     )
     parser.add_argument(
-        "--horizon_length",
-        type=int,
-        default=1,
-        help="Length of future to optimize over",
+        "--horizon_length", type=int, default=1, help="Length of future to optimize over",
     )
-    parser.add_argument("--decay", action="store_true",
-                        help="whether to decay or not")
+    parser.add_argument("--decay", action="store_true", help="whether to decay or not")
     parser.add_argument("--gamma", type=float, default=1.5)
     parser.add_argument("--gm_scale", type=float, default=0.9)
     parser.add_argument("--kl_scale", type=float, default=0.01)
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument("--no_cuda", action="store_true", help="no cuda")
-    parser.add_argument("--colorama", action="store_true",
-                        help="colors keywords")
+    parser.add_argument("--colorama", action="store_true", help="colors keywords")
 
     args = parser.parse_args()
     run_pplm_example(**vars(args))
diff --git a/examples/pplm/run_pplm_discrim_train.py b/examples/pplm/run_pplm_discrim_train.py
index 3055139d8c..14136c4c7a 100644
--- a/examples/pplm/run_pplm_discrim_train.py
+++ b/examples/pplm/run_pplm_discrim_train.py
@@ -1,19 +1,19 @@
 #! /usr/bin/env python3
 # coding=utf-8
 
-#Copyright (c) 2019 Uber Technologies, Inc.
+# Copyright (c) 2019 Uber Technologies, Inc.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import argparse
 import csv
@@ -42,26 +42,15 @@ example_sentence = "This is incredible! I love it, this is the best chicken I ha
 max_length_seq = 100
 
 
-
-
 class Discriminator(torch.nn.Module):
     """Transformer encoder followed by a Classification Head"""
 
-    def __init__(
-            self,
-            class_size,
-            pretrained_model="gpt2-medium",
-            cached_mode=False,
-            device='cpu'
-    ):
+    def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
         super(Discriminator, self).__init__()
         self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
         self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
         self.embed_size = self.encoder.transformer.config.hidden_size
-        self.classifier_head = ClassificationHead(
-            class_size=class_size,
-            embed_size=self.embed_size
-        )
+        self.classifier_head = ClassificationHead(class_size=class_size, embed_size=self.embed_size)
         self.cached_mode = cached_mode
         self.device = device
 
@@ -74,14 +63,10 @@ class Discriminator(torch.nn.Module):
         self.classifier_head.train()
 
     def avg_representation(self, x):
-        mask = x.ne(0).unsqueeze(2).repeat(
-            1, 1, self.embed_size
-        ).float().to(self.device).detach()
+        mask = x.ne(0).unsqueeze(2).repeat(1, 1, self.embed_size).float().to(self.device).detach()
         hidden, _ = self.encoder.transformer(x)
         masked_hidden = hidden * mask
-        avg_hidden = torch.sum(masked_hidden, dim=1) / (
-                torch.sum(mask, dim=1).detach() + EPSILON
-        )
+        avg_hidden = torch.sum(masked_hidden, dim=1) / (torch.sum(mask, dim=1).detach() + EPSILON)
         return avg_hidden
 
     def forward(self, x):
@@ -117,10 +102,7 @@ def collate_fn(data):
     def pad_sequences(sequences):
         lengths = [len(seq) for seq in sequences]
 
-        padded_sequences = torch.zeros(
-            len(sequences),
-            max(lengths)
-        ).long()  # padding value = 0
+        padded_sequences = torch.zeros(len(sequences), max(lengths)).long()  # padding value = 0
 
         for i, seq in enumerate(sequences):
             end = lengths[i]
@@ -149,8 +131,7 @@ def cached_collate_fn(data):
     return x_batch, y_batch
 
 
-def train_epoch(data_loader, discriminator, optimizer,
-                epoch=0, log_interval=10, device='cpu'):
+def train_epoch(data_loader, discriminator, optimizer, epoch=0, log_interval=10, device="cpu"):
     samples_so_far = 0
     discriminator.train_custom()
     for batch_idx, (input_t, target_t) in enumerate(data_loader):
@@ -169,13 +150,15 @@ def train_epoch(data_loader, discriminator, optimizer,
             print(
                 "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                     epoch + 1,
-                    samples_so_far, len(data_loader.dataset),
-                    100 * samples_so_far / len(data_loader.dataset), loss.item()
+                    samples_so_far,
+                    len(data_loader.dataset),
+                    100 * samples_so_far / len(data_loader.dataset),
+                    loss.item(),
                 )
             )
 
 
-def evaluate_performance(data_loader, discriminator, device='cpu'):
+def evaluate_performance(data_loader, discriminator, device="cpu"):
     discriminator.eval()
     test_loss = 0
     correct = 0
@@ -194,13 +177,12 @@ def evaluate_performance(data_loader, discriminator, device='cpu'):
     print(
         "Performance on test set: "
         "Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)".format(
-            test_loss, correct, len(data_loader.dataset),
-            100. * correct / len(data_loader.dataset)
+            test_loss, correct, len(data_loader.dataset), 100.0 * correct / len(data_loader.dataset)
         )
     )
 
 
-def predict(input_sentence, model, classes, cached=False, device='cpu'):
+def predict(input_sentence, model, classes, cached=False, device="cpu"):
     input_t = model.tokenizer.encode(input_sentence)
     input_t = torch.tensor([input_t], dtype=torch.long, device=device)
     if cached:
@@ -208,17 +190,14 @@ def predict(input_sentence, model, classes, cached=False, device='cpu'):
 
     log_probs = model(input_t).data.cpu().numpy().flatten().tolist()
     print("Input sentence:", input_sentence)
-    print("Predictions:", ", ".join(
-        "{}: {:.4f}".format(c, math.exp(log_prob)) for c, log_prob in
-        zip(classes, log_probs)
-    ))
+    print(
+        "Predictions:",
+        ", ".join("{}: {:.4f}".format(c, math.exp(log_prob)) for c, log_prob in zip(classes, log_probs)),
+    )
 
 
-def get_cached_data_loader(dataset, batch_size, discriminator,
-                           shuffle=False, device='cpu'):
-    data_loader = torch.utils.data.DataLoader(dataset=dataset,
-                                              batch_size=batch_size,
-                                              collate_fn=collate_fn)
+def get_cached_data_loader(dataset, batch_size, discriminator, shuffle=False, device="cpu"):
+    data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=collate_fn)
 
     xs = []
     ys = []
@@ -231,50 +210,44 @@ def get_cached_data_loader(dataset, batch_size, discriminator,
             ys += y.cpu().numpy().tolist()
 
     data_loader = torch.utils.data.DataLoader(
-        dataset=Dataset(xs, ys),
-        batch_size=batch_size,
-        shuffle=shuffle,
-        collate_fn=cached_collate_fn)
+        dataset=Dataset(xs, ys), batch_size=batch_size, shuffle=shuffle, collate_fn=cached_collate_fn
+    )
 
     return data_loader
 
 
 def train_discriminator(
-        dataset, dataset_fp=None, pretrained_model="gpt2-medium",
-        epochs=10, batch_size=64, log_interval=10,
-        save_model=False, cached=False, no_cuda=False):
+    dataset,
+    dataset_fp=None,
+    pretrained_model="gpt2-medium",
+    epochs=10,
+    batch_size=64,
+    log_interval=10,
+    save_model=False,
+    cached=False,
+    no_cuda=False,
+):
     device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
 
     print("Preprocessing {} dataset...".format(dataset))
     start = time.time()
 
     if dataset == "SST":
-        idx2class = ["positive", "negative", "very positive", "very negative",
-                     "neutral"]
+        idx2class = ["positive", "negative", "very positive", "very negative", "neutral"]
         class2idx = {c: i for i, c in enumerate(idx2class)}
 
         discriminator = Discriminator(
-            class_size=len(idx2class),
-            pretrained_model=pretrained_model,
-            cached_mode=cached,
-            device=device
+            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
         ).to(device)
 
         text = torchtext_data.Field()
         label = torchtext_data.Field(sequential=False)
-        train_data, val_data, test_data = datasets.SST.splits(
-            text,
-            label,
-            fine_grained=True,
-            train_subtrees=True,
-        )
+        train_data, val_data, test_data = datasets.SST.splits(text, label, fine_grained=True, train_subtrees=True,)
 
         x = []
         y = []
         for i in trange(len(train_data), ascii=True):
-            seq = TreebankWordDetokenizer().detokenize(
-                vars(train_data[i])["text"]
-            )
+            seq = TreebankWordDetokenizer().detokenize(vars(train_data[i])["text"])
             seq = discriminator.tokenizer.encode(seq)
             seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
             x.append(seq)
@@ -284,9 +257,7 @@ def train_discriminator(
         test_x = []
         test_y = []
         for i in trange(len(test_data), ascii=True):
-            seq = TreebankWordDetokenizer().detokenize(
-                vars(test_data[i])["text"]
-            )
+            seq = TreebankWordDetokenizer().detokenize(vars(test_data[i])["text"])
             seq = discriminator.tokenizer.encode(seq)
             seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
             test_x.append(seq)
@@ -306,10 +277,7 @@ def train_discriminator(
         class2idx = {c: i for i, c in enumerate(idx2class)}
 
         discriminator = Discriminator(
-            class_size=len(idx2class),
-            pretrained_model=pretrained_model,
-            cached_mode=cached,
-            device=device
+            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
         ).to(device)
 
         with open("datasets/clickbait/clickbait_train_prefix.txt") as f:
@@ -318,9 +286,7 @@ def train_discriminator(
                 try:
                     data.append(eval(line))
                 except:
-                    print("Error evaluating line {}: {}".format(
-                        i, line
-                    ))
+                    print("Error evaluating line {}: {}".format(i, line))
                     continue
         x = []
         y = []
@@ -331,27 +297,20 @@ def train_discriminator(
                     seq = discriminator.tokenizer.encode(d["text"])
 
                     if len(seq) < max_length_seq:
-                        seq = torch.tensor(
-                            [50256] + seq, device=device, dtype=torch.long
-                        )
+                        seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
                     else:
-                        print("Line {} is longer than maximum length {}".format(
-                            i, max_length_seq
-                        ))
+                        print("Line {} is longer than maximum length {}".format(i, max_length_seq))
                         continue
                     x.append(seq)
                     y.append(d["label"])
                 except:
-                    print("Error evaluating / tokenizing"
-                          " line {}, skipping it".format(i))
+                    print("Error evaluating / tokenizing" " line {}, skipping it".format(i))
                     pass
 
         full_dataset = Dataset(x, y)
         train_size = int(0.9 * len(full_dataset))
         test_size = len(full_dataset) - train_size
-        train_dataset, test_dataset = torch.utils.data.random_split(
-            full_dataset, [train_size, test_size]
-        )
+        train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
 
         discriminator_meta = {
             "class_size": len(idx2class),
@@ -366,10 +325,7 @@ def train_discriminator(
         class2idx = {c: i for i, c in enumerate(idx2class)}
 
         discriminator = Discriminator(
-            class_size=len(idx2class),
-            pretrained_model=pretrained_model,
-            cached_mode=cached,
-            device=device
+            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
         ).to(device)
 
         x = []
@@ -381,27 +337,20 @@ def train_discriminator(
                     seq = discriminator.tokenizer.encode(d["text"])
 
                     if len(seq) < max_length_seq:
-                        seq = torch.tensor(
-                            [50256] + seq, device=device, dtype=torch.long
-                        )
+                        seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
                     else:
-                        print("Line {} is longer than maximum length {}".format(
-                            i, max_length_seq
-                        ))
+                        print("Line {} is longer than maximum length {}".format(i, max_length_seq))
                         continue
                     x.append(seq)
                     y.append(int(np.sum(d["label"]) > 0))
                 except:
-                    print("Error evaluating / tokenizing"
-                          " line {}, skipping it".format(i))
+                    print("Error evaluating / tokenizing" " line {}, skipping it".format(i))
                     pass
 
         full_dataset = Dataset(x, y)
         train_size = int(0.9 * len(full_dataset))
         test_size = len(full_dataset) - train_size
-        train_dataset, test_dataset = torch.utils.data.random_split(
-            full_dataset, [train_size, test_size]
-        )
+        train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
 
         discriminator_meta = {
             "class_size": len(idx2class),
@@ -416,8 +365,7 @@ def train_discriminator(
         # class \t text
 
         if dataset_fp is None:
-            raise ValueError("When generic dataset is selected, "
-                             "dataset_fp needs to be specified aswell.")
+            raise ValueError("When generic dataset is selected, " "dataset_fp needs to be specified aswell.")
 
         classes = set()
         with open(dataset_fp) as f:
@@ -430,10 +378,7 @@ def train_discriminator(
         class2idx = {c: i for i, c in enumerate(idx2class)}
 
         discriminator = Discriminator(
-            class_size=len(idx2class),
-            pretrained_model=pretrained_model,
-            cached_mode=cached,
-            device=device
+            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
         ).to(device)
 
         x = []
@@ -447,18 +392,11 @@ def train_discriminator(
 
                     try:
                         seq = discriminator.tokenizer.encode(text)
-                        if (len(seq) < max_length_seq):
-                            seq = torch.tensor(
-                                [50256] + seq,
-                                device=device,
-                                dtype=torch.long
-                            )
+                        if len(seq) < max_length_seq:
+                            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
 
                         else:
-                            print(
-                                "Line {} is longer than maximum length {}".format(
-                                    i, max_length_seq
-                                ))
+                            print("Line {} is longer than maximum length {}".format(i, max_length_seq))
                             continue
 
                         x.append(seq)
@@ -471,10 +409,7 @@ def train_discriminator(
         full_dataset = Dataset(x, y)
         train_size = int(0.9 * len(full_dataset))
         test_size = len(full_dataset) - train_size
-        train_dataset, test_dataset = torch.utils.data.random_split(
-            full_dataset,
-            [train_size, test_size]
-        )
+        train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
 
         discriminator_meta = {
             "class_size": len(idx2class),
@@ -485,9 +420,7 @@ def train_discriminator(
         }
 
     end = time.time()
-    print("Preprocessed {} data points".format(
-        len(train_dataset) + len(test_dataset))
-    )
+    print("Preprocessed {} data points".format(len(train_dataset) + len(test_dataset)))
     print("Data preprocessing took: {:.3f}s".format(end - start))
 
     if cached:
@@ -495,30 +428,21 @@ def train_discriminator(
 
         start = time.time()
 
-        train_loader = get_cached_data_loader(
-            train_dataset, batch_size, discriminator,
-            shuffle=True, device=device
-        )
+        train_loader = get_cached_data_loader(train_dataset, batch_size, discriminator, shuffle=True, device=device)
 
-        test_loader = get_cached_data_loader(
-            test_dataset, batch_size, discriminator, device=device
-        )
+        test_loader = get_cached_data_loader(test_dataset, batch_size, discriminator, device=device)
 
         end = time.time()
         print("Building representation cache took: {:.3f}s".format(end - start))
 
     else:
-        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
-                                                   batch_size=batch_size,
-                                                   shuffle=True,
-                                                   collate_fn=collate_fn)
-        test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
-                                                  batch_size=batch_size,
-                                                  collate_fn=collate_fn)
+        train_loader = torch.utils.data.DataLoader(
+            dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
+        )
+        test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, collate_fn=collate_fn)
 
     if save_model:
-        with open("{}_classifier_head_meta.json".format(dataset),
-                  "w") as meta_file:
+        with open("{}_classifier_head_meta.json".format(dataset), "w") as meta_file:
             json.dump(discriminator_meta, meta_file)
 
     optimizer = optim.Adam(discriminator.parameters(), lr=0.0001)
@@ -533,56 +457,61 @@ def train_discriminator(
             optimizer=optimizer,
             epoch=epoch,
             log_interval=log_interval,
-            device=device
-        )
-        evaluate_performance(
-            data_loader=test_loader,
-            discriminator=discriminator,
-            device=device
+            device=device,
         )
+        evaluate_performance(data_loader=test_loader, discriminator=discriminator, device=device)
 
         end = time.time()
         print("Epoch took: {:.3f}s".format(end - start))
 
         print("\nExample prediction")
-        predict(example_sentence, discriminator, idx2class,
-                cached=cached, device=device)
+        predict(example_sentence, discriminator, idx2class, cached=cached, device=device)
 
         if save_model:
             # torch.save(discriminator.state_dict(),
             #           "{}_discriminator_{}.pt".format(
             #               args.dataset, epoch + 1
             #               ))
-            torch.save(discriminator.get_classifier().state_dict(),
-                       "{}_classifier_head_epoch_{}.pt".format(dataset,
-                                                               epoch + 1))
+            torch.save(
+                discriminator.get_classifier().state_dict(),
+                "{}_classifier_head_epoch_{}.pt".format(dataset, epoch + 1),
+            )
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Train a discriminator on top of GPT-2 representations")
-    parser.add_argument("--dataset", type=str, default="SST",
-                        choices=("SST", "clickbait", "toxic", "generic"),
-                        help="dataset to train the discriminator on."
-                             "In case of generic, the dataset is expected"
-                             "to be a TSBV file with structure: class \\t text")
-    parser.add_argument("--dataset_fp", type=str, default="",
-                        help="File path of the dataset to use. "
-                             "Needed only in case of generic datadset")
-    parser.add_argument("--pretrained_model", type=str, default="gpt2-medium",
-                        help="Pretrained model to use as encoder")
-    parser.add_argument("--epochs", type=int, default=10, metavar="N",
-                        help="Number of training epochs")
-    parser.add_argument("--batch_size", type=int, default=64, metavar="N",
-                        help="input batch size for training (default: 64)")
-    parser.add_argument("--log_interval", type=int, default=10, metavar="N",
-                        help="how many batches to wait before logging training status")
-    parser.add_argument("--save_model", action="store_true",
-                        help="whether to save the model")
-    parser.add_argument("--cached", action="store_true",
-                        help="whether to cache the input representations")
-    parser.add_argument("--no_cuda", action="store_true",
-                        help="use to turn off cuda")
+    parser = argparse.ArgumentParser(description="Train a discriminator on top of GPT-2 representations")
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="SST",
+        choices=("SST", "clickbait", "toxic", "generic"),
+        help="dataset to train the discriminator on."
+        "In case of generic, the dataset is expected"
+        "to be a TSBV file with structure: class \\t text",
+    )
+    parser.add_argument(
+        "--dataset_fp",
+        type=str,
+        default="",
+        help="File path of the dataset to use. " "Needed only in case of generic datadset",
+    )
+    parser.add_argument(
+        "--pretrained_model", type=str, default="gpt2-medium", help="Pretrained model to use as encoder"
+    )
+    parser.add_argument("--epochs", type=int, default=10, metavar="N", help="Number of training epochs")
+    parser.add_argument(
+        "--batch_size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)"
+    )
+    parser.add_argument(
+        "--log_interval",
+        type=int,
+        default=10,
+        metavar="N",
+        help="how many batches to wait before logging training status",
+    )
+    parser.add_argument("--save_model", action="store_true", help="whether to save the model")
+    parser.add_argument("--cached", action="store_true", help="whether to cache the input representations")
+    parser.add_argument("--no_cuda", action="store_true", help="use to turn off cuda")
     args = parser.parse_args()
 
     train_discriminator(**(vars(args)))
diff --git a/examples/run_bertology.py b/examples/run_bertology.py
index d1d05a1073..6b4739d6bd 100644
--- a/examples/run_bertology.py
+++ b/examples/run_bertology.py
@@ -32,10 +32,18 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from transformers import (WEIGHTS_NAME,
-                                  BertConfig, BertForSequenceClassification, BertTokenizer,
-                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer,
-                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertForSequenceClassification,
+    BertTokenizer,
+    XLMConfig,
+    XLMForSequenceClassification,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetForSequenceClassification,
+    XLNetTokenizer,
+)
 
 from run_glue import set_seed, load_and_cache_examples, ALL_MODELS, MODEL_CLASSES
 
@@ -63,7 +71,9 @@ def print_2d_tensor(tensor):
             logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data))
 
 
-def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None):
+def compute_heads_importance(
+    args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None
+):
     """ This method shows how to compute:
         - head attention entropy
         - head importance scores according to http://arxiv.org/abs/1905.10650
@@ -85,8 +95,14 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True,
         input_ids, input_mask, segment_ids, label_ids = batch
 
         # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
-        outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask)
-        loss, logits, all_attentions = outputs[0], outputs[1], outputs[-1]  # Loss and logits are the first, attention the last
+        outputs = model(
+            input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask
+        )
+        loss, logits, all_attentions = (
+            outputs[0],
+            outputs[1],
+            outputs[-1],
+        )  # Loss and logits are the first, attention the last
         loss.backward()  # Backpropagate to populate the gradients in the head mask
 
         if compute_entropy:
@@ -113,15 +129,15 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True,
     # Layerwise importance normalization
     if not args.dont_normalize_importance_by_layer:
         exponent = 2
-        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1/exponent)
+        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1 / exponent)
         head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20
 
     if not args.dont_normalize_global_importance:
         head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())
 
     # Print/save matrices
-    np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy())
-    np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy())
+    np.save(os.path.join(args.output_dir, "attn_entropy.npy"), attn_entropy.detach().cpu().numpy())
+    np.save(os.path.join(args.output_dir, "head_importance.npy"), head_importance.detach().cpu().numpy())
 
     logger.info("Attention entropies")
     print_2d_tensor(attn_entropy)
@@ -129,7 +145,9 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True,
     print_2d_tensor(head_importance)
     logger.info("Head ranked by importance scores")
     head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)
-    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device)
+    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(
+        head_importance.numel(), device=args.device
+    )
     head_ranks = head_ranks.view_as(head_importance)
     print_2d_tensor(head_ranks)
 
@@ -150,9 +168,9 @@ def mask_heads(args, model, eval_dataloader):
 
     current_score = original_score
     while current_score >= original_score * args.masking_threshold:
-        head_mask = new_head_mask.clone() # save current head mask
+        head_mask = new_head_mask.clone()  # save current head mask
         # heads from least important to most - keep only not-masked heads
-        head_importance[head_mask == 0.0] = float('Inf')
+        head_importance[head_mask == 0.0] = float("Inf")
         current_heads_to_mask = head_importance.view(-1).sort()[1]
 
         if len(current_heads_to_mask) <= num_to_mask:
@@ -167,14 +185,21 @@ def mask_heads(args, model, eval_dataloader):
         print_2d_tensor(new_head_mask)
 
         # Compute metric and head importance again
-        _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask)
+        _, head_importance, preds, labels = compute_heads_importance(
+            args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask
+        )
         preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
         current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
-        logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100)
+        logger.info(
+            "Masking: current score: %f, remaning heads %d (%.1f percents)",
+            current_score,
+            new_head_mask.sum(),
+            new_head_mask.sum() / new_head_mask.numel() * 100,
+        )
 
     logger.info("Final head mask")
     print_2d_tensor(head_mask)
-    np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy())
+    np.save(os.path.join(args.output_dir, "head_mask.npy"), head_mask.detach().cpu().numpy())
 
     return head_mask
 
@@ -186,8 +211,9 @@ def prune_heads(args, model, eval_dataloader, head_mask):
     # Try pruning and test time speedup
     # Pruning is like masking but we actually remove the masked weights
     before_time = datetime.now()
-    _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
-                                                   compute_entropy=False, compute_importance=False, head_mask=head_mask)
+    _, _, preds, labels = compute_heads_importance(
+        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask
+    )
     preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
     score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name]
     original_time = datetime.now() - before_time
@@ -199,73 +225,127 @@ def prune_heads(args, model, eval_dataloader, head_mask):
     pruned_num_params = sum(p.numel() for p in model.parameters())
 
     before_time = datetime.now()
-    _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
-                                                    compute_entropy=False, compute_importance=False, head_mask=None)
+    _, _, preds, labels = compute_heads_importance(
+        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=None
+    )
     preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
     score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name]
     new_time = datetime.now() - before_time
 
-    logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100)
+    logger.info(
+        "Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)",
+        original_num_params,
+        pruned_num_params,
+        pruned_num_params / original_num_params * 100,
+    )
     logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
-    logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100)
+    logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time / new_time * 100)
 
 
 def main():
     parser = argparse.ArgumentParser()
     ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
-                            ALL_MODELS))
-    parser.add_argument("--task_name", default=None, type=str, required=True,
-                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--task_name",
+        default=None,
+        type=str,
+        required=True,
+        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
 
     ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name_or_path")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name_or_path")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--data_subset", type=int, default=-1,
-                        help="If > 0: limit the data to a subset of data_subset instances.")
-    parser.add_argument("--overwrite_output_dir", action='store_true',
-                        help="Whether to overwrite data in output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument(
+        "--config_name",
+        default="",
+        type=str,
+        help="Pretrained config name or path if not the same as model_name_or_path",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name_or_path",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances."
+    )
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Whether to overwrite data in output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
 
-    parser.add_argument("--dont_normalize_importance_by_layer", action='store_true',
-                        help="Don't normalize importance score by layers")
-    parser.add_argument("--dont_normalize_global_importance", action='store_true',
-                        help="Don't normalize all importance scores between 0 and 1")
+    parser.add_argument(
+        "--dont_normalize_importance_by_layer", action="store_true", help="Don't normalize importance score by layers"
+    )
+    parser.add_argument(
+        "--dont_normalize_global_importance",
+        action="store_true",
+        help="Don't normalize all importance scores between 0 and 1",
+    )
 
-    parser.add_argument("--try_masking", action='store_true',
-                        help="Whether to try to mask head until a threshold of accuracy.")
-    parser.add_argument("--masking_threshold", default=0.9, type=float,
-                        help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).")
-    parser.add_argument("--masking_amount", default=0.1, type=float,
-                        help="Amount to heads to masking at each masking step.")
-    parser.add_argument("--metric_name", default="acc", type=str,
-                        help="Metric to use for head masking.")
+    parser.add_argument(
+        "--try_masking", action="store_true", help="Whether to try to mask head until a threshold of accuracy."
+    )
+    parser.add_argument(
+        "--masking_threshold",
+        default=0.9,
+        type=float,
+        help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).",
+    )
+    parser.add_argument(
+        "--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step."
+    )
+    parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.")
 
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, sequences shorter padded.")
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after WordPiece tokenization. \n"
+        "Sequences longer than this will be truncated, sequences shorter padded.",
+    )
     parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")
 
     parser.add_argument("--seed", type=int, default=42)
     parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
-    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
     args = parser.parse_args()
 
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
+
         print("Waiting for debugger attach")
         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()
@@ -278,10 +358,10 @@ def main():
         torch.cuda.set_device(args.local_rank)
         args.device = torch.device("cuda", args.local_rank)
         args.n_gpu = 1
-        torch.distributed.init_process_group(backend='nccl')  # Initializes the distributed backend
+        torch.distributed.init_process_group(backend="nccl")  # Initializes the distributed backend
 
     # Setup logging
-    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
     logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1)))
 
     # Set seeds
@@ -306,17 +386,23 @@ def main():
             args.model_type = key  # take the first match in model types
             break
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          num_labels=num_labels,
-                                          finetuning_task=args.task_name,
-                                          output_attentions=True,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=args.task_name,
+        output_attentions=True,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
 
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -324,14 +410,14 @@ def main():
     # Distributed and parallel training
     model.to(args.device)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
     elif args.n_gpu > 1:
         model = torch.nn.DataParallel(model)
 
     # Print/save training arguments
-    torch.save(args, os.path.join(args.output_dir, 'run_args.bin'))
+    torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
     logger.info("Training/evaluation parameters %s", args)
 
     # Prepare dataset for the GLUE task
@@ -341,11 +427,9 @@ def main():
     eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
     eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
 
-
     # Compute head entropy and importance score
     compute_heads_importance(args, model, eval_dataloader)
 
-
     # Try head masking (set heads to zero until the score goes under a threshole)
     # and head pruning (remove masked heads and see the effect on the network)
     if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
@@ -353,5 +437,5 @@ def main():
         prune_heads(args, model, eval_dataloader, head_mask)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/examples/run_generation.py b/examples/run_generation.py
index 536d4a18f0..e62ccf87c6 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -33,9 +33,7 @@ from transformers import XLMWithLMHeadModel, XLMTokenizer
 
 
 logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-    datefmt="%m/%d/%Y %H:%M:%S",
-    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO,
 )
 logger = logging.getLogger(__name__)
 
@@ -71,6 +69,7 @@ def set_seed(args):
     if args.n_gpu > 0:
         torch.cuda.manual_seed_all(args.seed)
 
+
 #
 # Functions to prepare models' input
 #
@@ -78,15 +77,11 @@ def set_seed(args):
 
 def prepare_ctrl_input(args, _, tokenizer, prompt_text):
     if args.temperature > 0.7:
-        logger.info(
-            "CTRL typically works better with lower temperatures (and lower top_k)."
-        )
+        logger.info("CTRL typically works better with lower temperatures (and lower top_k).")
 
     encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False)
     if not any(encoded_prompt[0] == x for x in tokenizer.control_codes.values()):
-        logger.info(
-            "WARNING! You are not starting your generation from a control code so you won't get good results"
-        )
+        logger.info("WARNING! You are not starting your generation from a control code so you won't get good results")
     return prompt_text
 
 
@@ -102,11 +97,7 @@ def prepare_xlm_input(args, model, tokenizer, prompt_text):
         else:
             language = None
             while language not in available_languages:
-                language = input(
-                    "Using XLM. Select language in "
-                    + str(list(available_languages))
-                    + " >>> "
-                )
+                language = input("Using XLM. Select language in " + str(list(available_languages)) + " >>> ")
         # kwargs["language"] = tokenizer.lang2id[language]
 
     # TODO fix mask_token_id setup when configurations will be synchronized between models and tokenizers
@@ -148,17 +139,34 @@ def adjust_length_to_model(length, max_sequence_length):
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
 
     parser.add_argument("--prompt", type=str, default="")
     parser.add_argument("--length", type=int, default=20)
     parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped")
 
-    parser.add_argument("--temperature", type=float, default=1.0, help="temperature of 1.0 has no effect, lower tend toward greedy sampling")
-    parser.add_argument("--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2")
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=1.0,
+        help="temperature of 1.0 has no effect, lower tend toward greedy sampling",
+    )
+    parser.add_argument(
+        "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2"
+    )
     parser.add_argument("--k", type=int, default=0)
     parser.add_argument("--p", type=float, default=0.9)
 
@@ -169,9 +177,7 @@ def main():
     parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
     args = parser.parse_args()
 
-    args.device = torch.device(
-        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
-    )
+    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
     args.n_gpu = torch.cuda.device_count()
 
     set_seed(args)
@@ -181,17 +187,13 @@ def main():
         args.model_type = args.model_type.lower()
         model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
     except KeyError:
-        raise KeyError(
-            "the model {} you specified is not supported. You are welcome to add it and open a PR :)"
-        )
+        raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)")
 
     tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
     model = model_class.from_pretrained(args.model_name_or_path)
     model.to(args.device)
 
-    args.length = adjust_length_to_model(
-        args.length, max_sequence_length=model.config.max_position_embeddings
-    )
+    args.length = adjust_length_to_model(args.length, max_sequence_length=model.config.max_position_embeddings)
     logger.info(args)
 
     prompt_text = args.prompt if args.prompt else input("Model prompt >>> ")
@@ -201,7 +203,7 @@ def main():
     if requires_preprocessing:
         prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type)
         prompt_text = prepare_input(args, model, tokenizer, prompt_text)
-    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors='pt')
+    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
 
     output_sequences = model.generate(
         input_ids=encoded_prompt,
@@ -212,7 +214,7 @@ def main():
         repetition_penalty=args.repetition_penalty,
     )
 
-    # Batch size == 1. to add more examples please use num_return_sequences > 1 
+    # Batch size == 1. to add more examples please use num_return_sequences > 1
     generated_sequence = output_sequences[0].tolist()
     text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
     text = text[: t.find(args.stop_token) if args.stop_token else None]
diff --git a/examples/run_glue.py b/examples/run_glue.py
index c143b6205b..bbfd52ea3d 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -26,8 +26,7 @@ import json
 
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 
 try:
@@ -37,25 +36,30 @@ except:
 
 from tqdm import tqdm, trange
 
-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForSequenceClassification, BertTokenizer,
-                                  RobertaConfig,
-                                  RobertaForSequenceClassification,
-                                  RobertaTokenizer,
-                                  XLMConfig, XLMForSequenceClassification,
-                                  XLMTokenizer, XLNetConfig,
-                                  XLNetForSequenceClassification,
-                                  XLNetTokenizer,
-                                  DistilBertConfig,
-                                  DistilBertForSequenceClassification,
-                                  DistilBertTokenizer,
-                                  AlbertConfig,
-                                  AlbertForSequenceClassification, 
-                                  AlbertTokenizer,
-                                  XLMRobertaConfig,
-                                  XLMRobertaForSequenceClassification,
-                                  XLMRobertaTokenizer,
-                                )
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertForSequenceClassification,
+    BertTokenizer,
+    RobertaConfig,
+    RobertaForSequenceClassification,
+    RobertaTokenizer,
+    XLMConfig,
+    XLMForSequenceClassification,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetForSequenceClassification,
+    XLNetTokenizer,
+    DistilBertConfig,
+    DistilBertForSequenceClassification,
+    DistilBertTokenizer,
+    AlbertConfig,
+    AlbertForSequenceClassification,
+    AlbertTokenizer,
+    XLMRobertaConfig,
+    XLMRobertaForSequenceClassification,
+    XLMRobertaTokenizer,
+)
 
 from transformers import AdamW, get_linear_schedule_with_warmup
 
@@ -66,17 +70,22 @@ from transformers import glue_convert_examples_to_features as convert_examples_t
 
 logger = logging.getLogger(__name__)
 
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, 
-                                                                                RobertaConfig, DistilBertConfig)), ())
+ALL_MODELS = sum(
+    (
+        tuple(conf.pretrained_config_archive_map.keys())
+        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
+    ),
+    (),
+)
 
 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
-    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
-    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
-    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
-    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
-    'xlmroberta': (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
+    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
+    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
+    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
+    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
+    "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
+    "xlmroberta": (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
 }
 
 
@@ -104,20 +113,27 @@ def train(args, train_dataset, model, tokenizer):
         t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
     # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
 
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
 
     # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
+    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+        os.path.join(args.model_name_or_path, "scheduler.pt")
+    ):
         # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
 
     if args.fp16:
         try:
@@ -132,17 +148,21 @@ def train(args, train_dataset, model, tokenizer):
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
 
     # Train!
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
     logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
     logger.info("  Total optimization steps = %d", t_total)
 
@@ -152,7 +172,7 @@ def train(args, train_dataset, model, tokenizer):
     # Check if continuing training from a checkpoint
     if os.path.exists(args.model_name_or_path):
         # set global_step to gobal_step of last saved checkpoint from model path
-        global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
+        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
         epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
         steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
 
@@ -163,7 +183,9 @@ def train(args, train_dataset, model, tokenizer):
 
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
-    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+    )
     set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
@@ -176,16 +198,16 @@ def train(args, train_dataset, model, tokenizer):
 
             model.train()
             batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      'labels':         batch[3]}
-            if args.model_type != 'distilbert':
-                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = (
+                    batch[2] if args.model_type in ["bert", "xlnet"] else None
+                )  # XLM, DistilBERT and RoBERTa don't use segment_ids
             outputs = model(**inputs)
             loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
 
             if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
             if args.gradient_accumulation_steps > 1:
                 loss = loss / args.gradient_accumulation_steps
 
@@ -209,36 +231,40 @@ def train(args, train_dataset, model, tokenizer):
 
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     logs = {}
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                         results = evaluate(args, model, tokenizer)
                         for key, value in results.items():
-                            eval_key = 'eval_{}'.format(key)
+                            eval_key = "eval_{}".format(key)
                             logs[eval_key] = value
 
                     loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                     learning_rate_scalar = scheduler.get_lr()[0]
-                    logs['learning_rate'] = learning_rate_scalar
-                    logs['loss'] = loss_scalar
+                    logs["learning_rate"] = learning_rate_scalar
+                    logs["loss"] = loss_scalar
                     logging_loss = tr_loss
 
                     for key, value in logs.items():
                         tb_writer.add_scalar(key, value, global_step)
-                    print(json.dumps({**logs, **{'step': global_step}}))
+                    print(json.dumps({**logs, **{"step": global_step}}))
 
                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                     # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                     model_to_save.save_pretrained(output_dir)
                     tokenizer.save_pretrained(output_dir)
 
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to %s", output_dir)
 
-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                     logger.info("Saving optimizer and scheduler states to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
@@ -257,7 +283,7 @@ def train(args, train_dataset, model, tokenizer):
 def evaluate(args, model, tokenizer, prefix=""):
     # Loop to handle MNLI double evaluation (matched, mis-matched)
     eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
-    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
+    eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)
 
     results = {}
     for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
@@ -288,11 +314,11 @@ def evaluate(args, model, tokenizer, prefix=""):
             batch = tuple(t.to(args.device) for t in batch)
 
             with torch.no_grad():
-                inputs = {'input_ids':      batch[0],
-                          'attention_mask': batch[1],
-                          'labels':         batch[3]}
-                if args.model_type != 'distilbert':
-                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
+                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+                if args.model_type != "distilbert":
+                    inputs["token_type_ids"] = (
+                        batch[2] if args.model_type in ["bert", "xlnet"] else None
+                    )  # XLM, DistilBERT and RoBERTa don't use segment_ids
                 outputs = model(**inputs)
                 tmp_eval_loss, logits = outputs[:2]
 
@@ -300,10 +326,10 @@ def evaluate(args, model, tokenizer, prefix=""):
             nb_eval_steps += 1
             if preds is None:
                 preds = logits.detach().cpu().numpy()
-                out_label_ids = inputs['labels'].detach().cpu().numpy()
+                out_label_ids = inputs["labels"].detach().cpu().numpy()
             else:
                 preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
 
         eval_loss = eval_loss / nb_eval_steps
         if args.output_mode == "classification":
@@ -330,29 +356,36 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
     processor = processors[task]()
     output_mode = output_modes[task]
     # Load data features from cache or dataset file
-    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
-        'dev' if evaluate else 'train',
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length),
-        str(task)))
+    cached_features_file = os.path.join(
+        args.data_dir,
+        "cached_{}_{}_{}_{}".format(
+            "dev" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+            str(task),
+        ),
+    )
     if os.path.exists(cached_features_file) and not args.overwrite_cache:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
         logger.info("Creating features from dataset file at %s", args.data_dir)
         label_list = processor.get_labels()
-        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta', 'xlmroberta']:
+        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]:
             # HACK(label indices are swapped in RoBERTa pretrained model)
             label_list[1], label_list[2] = label_list[2], label_list[1]
-        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-        features = convert_examples_to_features(examples,
-                                                tokenizer,
-                                                label_list=label_list,
-                                                max_length=args.max_seq_length,
-                                                output_mode=output_mode,
-                                                pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-                                                pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
+        examples = (
+            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+        )
+        features = convert_examples_to_features(
+            examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
+            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
         )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
@@ -369,7 +402,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
         all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
     elif output_mode == "regression":
         all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
- 
+
     dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
     return dataset
 
@@ -378,90 +411,149 @@ def main():
     parser = argparse.ArgumentParser()
 
     ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--task_name", default=None, type=str, required=True,
-                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--task_name",
+        default=None,
+        type=str,
+        required=True,
+        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
 
     ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences longer "
-                             "than this will be truncated, sequences shorter will be padded.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
 
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")     
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
 
-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Avoid using CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
 
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
     args = parser.parse_args()
 
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
 
     # Setup distant debugging if needed
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
+
         print("Waiting for debugger attach")
         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()
@@ -473,16 +565,24 @@ def main():
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
         args.n_gpu = 1
     args.device = device
 
     # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
 
     # Set seed
     set_seed(args)
@@ -502,17 +602,23 @@ def main():
 
     args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          num_labels=num_labels,
-                                          finetuning_task=args.task_name,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=args.task_name,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
 
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -521,14 +627,12 @@ def main():
 
     logger.info("Training/evaluation parameters %s", args)
 
-
     # Training
     if args.do_train:
         train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
         global_step, tr_loss = train(args, train_dataset, model, tokenizer)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
-
     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Create output directory if needed
@@ -538,36 +642,39 @@ def main():
         logger.info("Saving model checkpoint to %s", args.output_dir)
         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
         model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = model_class.from_pretrained(args.output_dir)
         tokenizer = tokenizer_class.from_pretrained(args.output_dir)
         model.to(args.device)
 
-
     # Evaluation
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
         tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
             logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-            
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
             result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
             results.update(result)
 
     return results
diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index b659e229bf..60b99f29d4 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -42,37 +42,55 @@ except:
 
 from tqdm import tqdm, trange
 
-from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
-                                  BertConfig, BertForMaskedLM, BertTokenizer,
-                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
-                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
-                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
-                                  DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
-                                  CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)
+from transformers import (
+    WEIGHTS_NAME,
+    AdamW,
+    get_linear_schedule_with_warmup,
+    BertConfig,
+    BertForMaskedLM,
+    BertTokenizer,
+    GPT2Config,
+    GPT2LMHeadModel,
+    GPT2Tokenizer,
+    OpenAIGPTConfig,
+    OpenAIGPTLMHeadModel,
+    OpenAIGPTTokenizer,
+    RobertaConfig,
+    RobertaForMaskedLM,
+    RobertaTokenizer,
+    DistilBertConfig,
+    DistilBertForMaskedLM,
+    DistilBertTokenizer,
+    CamembertConfig,
+    CamembertForMaskedLM,
+    CamembertTokenizer,
+)
 
 
 logger = logging.getLogger(__name__)
 
 
 MODEL_CLASSES = {
-    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
-    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
-    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
-    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
-    'camembert': (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)
+    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
+    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
+    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
+    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
+    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
 }
 
 
 class TextDataset(Dataset):
-    def __init__(self, tokenizer, args, file_path='train', block_size=512):
+    def __init__(self, tokenizer, args, file_path="train", block_size=512):
         assert os.path.isfile(file_path)
         directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, args.model_name_or_path + '_cached_lm_' + str(block_size) + '_' + filename)
+        cached_features_file = os.path.join(
+            directory, args.model_name_or_path + "_cached_lm_" + str(block_size) + "_" + filename
+        )
 
         if os.path.exists(cached_features_file) and not args.overwrite_cache:
             logger.info("Loading features from cached file %s", cached_features_file)
-            with open(cached_features_file, 'rb') as handle:
+            with open(cached_features_file, "rb") as handle:
                 self.examples = pickle.load(handle)
         else:
             logger.info("Creating features from dataset file at %s", directory)
@@ -83,14 +101,14 @@ class TextDataset(Dataset):
 
             tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
 
-            for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
-                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
+            for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
+                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))
             # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
             # If your dataset is small, first you should loook for a bigger one :-) and second you
             # can change this behavior by adding (model specific) padding.
 
             logger.info("Saving features into cached file %s", cached_features_file)
-            with open(cached_features_file, 'wb') as handle:
+            with open(cached_features_file, "wb") as handle:
                 pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
 
     def __len__(self):
@@ -101,7 +119,12 @@ class TextDataset(Dataset):
 
 
 def load_and_cache_examples(args, tokenizer, evaluate=False):
-    dataset = TextDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size)
+    dataset = TextDataset(
+        tokenizer,
+        args,
+        file_path=args.eval_data_file if evaluate else args.train_data_file,
+        block_size=args.block_size,
+    )
     return dataset
 
 
@@ -120,7 +143,7 @@ def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
         return
 
     # Check if we should delete older checkpoint(s)
-    glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix)))
+    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))
     if len(glob_checkpoints) <= args.save_total_limit:
         return
 
@@ -129,7 +152,7 @@ def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
         if use_mtime:
             ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
         else:
-            regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path)
+            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
             if regex_match and regex_match.groups():
                 ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
 
@@ -147,7 +170,9 @@ def mask_tokens(inputs, tokenizer, args):
     labels = inputs.clone()
     # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
     probability_matrix = torch.full(labels.shape, args.mlm_probability)
-    special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
+    special_tokens_mask = [
+        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
+    ]
     probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
     masked_indices = torch.bernoulli(probability_matrix).bool()
     labels[~masked_indices] = -100  # We only compute loss on masked tokens
@@ -181,19 +206,26 @@ def train(args, train_dataset, model, tokenizer):
         t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
     # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
 
     # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
+    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+        os.path.join(args.model_name_or_path, "scheduler.pt")
+    ):
         # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
 
     if args.fp16:
         try:
@@ -208,17 +240,21 @@ def train(args, train_dataset, model, tokenizer):
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
 
     # Train!
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
     logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
     logger.info("  Total optimization steps = %d", t_total)
 
@@ -228,7 +264,7 @@ def train(args, train_dataset, model, tokenizer):
     # Check if continuing training from a checkpoint
     if os.path.exists(args.model_name_or_path):
         # set global_step to gobal_step of last saved checkpoint from model path
-        global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
+        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
         epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
         steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
 
@@ -239,16 +275,18 @@ def train(args, train_dataset, model, tokenizer):
 
     tr_loss, logging_loss = 0.0, 0.0
 
-    model_to_resize = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
     model_to_resize.resize_token_embeddings(len(tokenizer))
 
     model.zero_grad()
-    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+    )
     set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
-            
+
             # Skip past any already trained steps if resuming training
             if steps_trained_in_current_epoch > 0:
                 steps_trained_in_current_epoch -= 1
@@ -285,31 +323,35 @@ def train(args, train_dataset, model, tokenizer):
 
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                         results = evaluate(args, model, tokenizer)
                         for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                     logging_loss = tr_loss
 
                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    checkpoint_prefix = 'checkpoint'
+                    checkpoint_prefix = "checkpoint"
                     # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step))
+                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                     model_to_save.save_pretrained(output_dir)
                     tokenizer.save_pretrained(output_dir)
 
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to %s", output_dir)
 
                     _rotate_checkpoints(args, checkpoint_prefix)
 
-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                     logger.info("Saving optimizer and scheduler states to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
@@ -365,9 +407,7 @@ def evaluate(args, model, tokenizer, prefix=""):
     eval_loss = eval_loss / nb_eval_steps
     perplexity = torch.exp(torch.tensor(eval_loss))
 
-    result = {
-        "perplexity": perplexity
-    }
+    result = {"perplexity": perplexity}
 
     output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
     with open(output_eval_file, "w") as writer:
@@ -383,107 +423,167 @@ def main():
     parser = argparse.ArgumentParser()
 
     ## Required parameters
-    parser.add_argument("--train_data_file", default=None, type=str, required=True,
-                        help="The input training data file (a text file).")
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)."
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
 
     ## Other parameters
-    parser.add_argument("--eval_data_file", default=None, type=str,
-                        help="An optional input evaluation data file to evaluate the perplexity on (a text file).")
+    parser.add_argument(
+        "--eval_data_file",
+        default=None,
+        type=str,
+        help="An optional input evaluation data file to evaluate the perplexity on (a text file).",
+    )
 
-    parser.add_argument("--model_type", default="bert", type=str,
-                        help="The model architecture to be fine-tuned.")
-    parser.add_argument("--model_name_or_path", default="bert-base-cased", type=str,
-                        help="The model checkpoint for weights initialization.")
+    parser.add_argument("--model_type", default="bert", type=str, help="The model architecture to be fine-tuned.")
+    parser.add_argument(
+        "--model_name_or_path",
+        default="bert-base-cased",
+        type=str,
+        help="The model checkpoint for weights initialization.",
+    )
 
-    parser.add_argument("--mlm", action='store_true',
-                        help="Train with masked-language modeling loss instead of language modeling.")
-    parser.add_argument("--mlm_probability", type=float, default=0.15,
-                        help="Ratio of tokens to mask for masked language modeling loss")
+    parser.add_argument(
+        "--mlm", action="store_true", help="Train with masked-language modeling loss instead of language modeling."
+    )
+    parser.add_argument(
+        "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
+    )
 
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Optional pretrained config name or path if not the same as model_name_or_path")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)")
-    parser.add_argument("--block_size", default=-1, type=int,
-                        help="Optional input sequence length after tokenization."
-                             "The training dataset will be truncated in block of this size for training."
-                             "Default to the model max input length for single sentence inputs (take into account special tokens).")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Run evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--config_name",
+        default="",
+        type=str,
+        help="Optional pretrained config name or path if not the same as model_name_or_path",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)",
+    )
+    parser.add_argument(
+        "--block_size",
+        default=-1,
+        type=int,
+        help="Optional input sequence length after tokenization."
+        "The training dataset will be truncated in block of this size for training."
+        "Default to the model max input length for single sentence inputs (take into account special tokens).",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
 
-    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=1.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
 
-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument('--save_total_limit', type=int, default=None,
-                        help='Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default')
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Avoid using CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--save_total_limit",
+        type=int,
+        default=None,
+        help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
+    )
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
 
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
     args = parser.parse_args()
 
     if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
-        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
-                         "flag (masked language modeling).")
+        raise ValueError(
+            "BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
+            "flag (masked language modeling)."
+        )
     if args.eval_data_file is None and args.do_eval:
-        raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
-                         "or remove the --do_eval argument.")
+        raise ValueError(
+            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
+            "or remove the --do_eval argument."
+        )
 
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
 
     # Setup distant debugging if needed
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
+
         print("Waiting for debugger attach")
         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()
@@ -495,16 +595,24 @@ def main():
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
         args.n_gpu = 1
     args.device = device
 
     # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
 
     # Set seed
     set_seed(args)
@@ -514,18 +622,26 @@ def main():
         torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab
 
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
     if args.block_size <= 0:
-        args.block_size = tokenizer.max_len_single_sentence  # Our input block size will be the max possible for the model
+        args.block_size = (
+            tokenizer.max_len_single_sentence
+        )  # Our input block size will be the max possible for the model
     args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
     model.to(args.device)
 
     if args.local_rank == 0:
@@ -546,7 +662,6 @@ def main():
         global_step, tr_loss = train(args, train_dataset, model, tokenizer)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
-
     # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Create output directory if needed
@@ -556,35 +671,38 @@ def main():
         logger.info("Saving model checkpoint to %s", args.output_dir)
         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
         model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = model_class.from_pretrained(args.output_dir)
         tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
         model.to(args.device)
 
-
     # Evaluation
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
             logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-            
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
             result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
             results.update(result)
 
     return results
diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py
index 9d1ca7f300..bfa62cfb7f 100644
--- a/examples/run_multiple_choice.py
+++ b/examples/run_multiple_choice.py
@@ -26,8 +26,7 @@ import random
 
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 
 try:
@@ -37,34 +36,38 @@ except:
 
 from tqdm import tqdm, trange
 
-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForMultipleChoice, BertTokenizer,
-                                  XLNetConfig, XLNetForMultipleChoice,
-                                  XLNetTokenizer, RobertaConfig,
-                                  RobertaForMultipleChoice, RobertaTokenizer)
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertForMultipleChoice,
+    BertTokenizer,
+    XLNetConfig,
+    XLNetForMultipleChoice,
+    XLNetTokenizer,
+    RobertaConfig,
+    RobertaForMultipleChoice,
+    RobertaTokenizer,
+)
 
 from transformers import AdamW, get_linear_schedule_with_warmup
 
-from utils_multiple_choice import (convert_examples_to_features, processors)
+from utils_multiple_choice import convert_examples_to_features, processors
 
 logger = logging.getLogger(__name__)
 
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), ())
+ALL_MODELS = sum(
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), ()
+)
 
 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForMultipleChoice, BertTokenizer),
-    'xlnet': (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
-    'roberta': (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer)
+    "bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
+    "xlnet": (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
+    "roberta": (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer),
 }
 
+
 def select_field(features, field):
-    return [
-        [
-            choice[field]
-            for choice in feature.choices_features
-        ]
-        for feature in features
-    ]
+    return [[choice[field] for choice in feature.choices_features] for feature in features]
 
 
 def simple_accuracy(preds, labels):
@@ -95,13 +98,18 @@ def train(args, train_dataset, model, tokenizer):
         t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
     # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
     if args.fp16:
         try:
             from apex import amp
@@ -115,17 +123,21 @@ def train(args, train_dataset, model, tokenizer):
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
 
     # Train!
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
     logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
     logger.info("  Total optimization steps = %d", t_total)
 
@@ -141,15 +153,19 @@ def train(args, train_dataset, model, tokenizer):
         for step, batch in enumerate(epoch_iterator):
             model.train()
             batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
-                      'labels':         batch[3]}
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                "token_type_ids": batch[2]
+                if args.model_type in ["bert", "xlnet"]
+                else None,  # XLM don't use segment_ids
+                "labels": batch[3],
+            }
             outputs = model(**inputs)
             loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
 
             if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
             if args.gradient_accumulation_steps > 1:
                 loss = loss / args.gradient_accumulation_steps
 
@@ -171,10 +187,12 @@ def train(args, train_dataset, model, tokenizer):
 
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                         results = evaluate(args, model, tokenizer)
                         for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                         if results["eval_acc"] > best_dev_acc:
                             best_dev_acc = results["eval_acc"]
                             best_dev_loss = results["eval_loss"]
@@ -182,22 +200,33 @@ def train(args, train_dataset, model, tokenizer):
                             if args.do_test:
                                 results_test = evaluate(args, model, tokenizer, test=True)
                                 for key, value in results_test.items():
-                                    tb_writer.add_scalar('test_{}'.format(key), value, global_step)
-                                logger.info("test acc: %s, loss: %s, global steps: %s", str(results_test['eval_acc']), str(results_test['eval_loss']), str(global_step))
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
-                    logger.info("Average loss: %s at global step: %s", str((tr_loss - logging_loss)/args.logging_steps), str(global_step))
+                                    tb_writer.add_scalar("test_{}".format(key), value, global_step)
+                                logger.info(
+                                    "test acc: %s, loss: %s, global steps: %s",
+                                    str(results_test["eval_acc"]),
+                                    str(results_test["eval_loss"]),
+                                    str(global_step),
+                                )
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
+                    logger.info(
+                        "Average loss: %s at global step: %s",
+                        str((tr_loss - logging_loss) / args.logging_steps),
+                        str(global_step),
+                    )
                     logging_loss = tr_loss
 
                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                     # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                     model_to_save.save_pretrained(output_dir)
                     tokenizer.save_vocabulary(output_dir)
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
@@ -246,10 +275,14 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
             batch = tuple(t.to(args.device) for t in batch)
 
             with torch.no_grad():
-                inputs = {'input_ids':      batch[0],
-                          'attention_mask': batch[1],
-                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
-                          'labels':         batch[3]}
+                inputs = {
+                    "input_ids": batch[0],
+                    "attention_mask": batch[1],
+                    "token_type_ids": batch[2]
+                    if args.model_type in ["bert", "xlnet"]
+                    else None,  # XLM don't use segment_ids
+                    "labels": batch[3],
+                }
                 outputs = model(**inputs)
                 tmp_eval_loss, logits = outputs[:2]
 
@@ -257,10 +290,10 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
             nb_eval_steps += 1
             if preds is None:
                 preds = logits.detach().cpu().numpy()
-                out_label_ids = inputs['labels'].detach().cpu().numpy()
+                out_label_ids = inputs["labels"].detach().cpu().numpy()
             else:
                 preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
 
         eval_loss = eval_loss / nb_eval_steps
         preds = np.argmax(preds, axis=1)
@@ -273,8 +306,14 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
         with open(output_eval_file, "w") as writer:
             logger.info("***** Eval results {} *****".format(str(prefix) + " is test:" + str(test)))
             writer.write("model           =%s\n" % str(args.model_name_or_path))
-            writer.write("total batch size=%d\n" % (args.per_gpu_train_batch_size * args.gradient_accumulation_steps *
-                         (torch.distributed.get_world_size() if args.local_rank != -1 else 1)))
+            writer.write(
+                "total batch size=%d\n"
+                % (
+                    args.per_gpu_train_batch_size
+                    * args.gradient_accumulation_steps
+                    * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)
+                )
+            )
             writer.write("train num epochs=%d\n" % args.num_train_epochs)
             writer.write("fp16            =%s\n" % args.fp16)
             writer.write("max seq length  =%d\n" % args.max_seq_length)
@@ -291,17 +330,21 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
     processor = processors[task]()
     # Load data features from cache or dataset file
     if evaluate:
-        cached_mode = 'dev'
+        cached_mode = "dev"
     elif test:
-        cached_mode = 'test'
+        cached_mode = "test"
     else:
-        cached_mode = 'train'
+        cached_mode = "train"
     assert (evaluate == True and test == True) == False
-    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
-        cached_mode,
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length),
-        str(task)))
+    cached_features_file = os.path.join(
+        args.data_dir,
+        "cached_{}_{}_{}_{}".format(
+            cached_mode,
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+            str(task),
+        ),
+    )
     if os.path.exists(cached_features_file) and not args.overwrite_cache:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
@@ -320,8 +363,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
             label_list,
             args.max_seq_length,
             tokenizer,
-            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0
+            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
+            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
         )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
@@ -331,9 +374,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
 
     # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
-    all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
-    all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
+    all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
+    all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
+    all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
     all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)
 
     dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
@@ -344,91 +387,150 @@ def main():
     parser = argparse.ArgumentParser()
 
     ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--task_name", default=None, type=str, required=True,
-                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--task_name",
+        default=None,
+        type=str,
+        required=True,
+        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
 
     ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences longer "
-                             "than this will be truncated, sequences shorter will be padded.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set')
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Run evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_test", action="store_true", help="Whether to run test on the test set")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
 
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
 
-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Avoid using CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
 
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
     args = parser.parse_args()
 
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
 
     # Setup distant debugging if needed
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
+
         print("Waiting for debugger attach")
         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()
@@ -440,16 +542,24 @@ def main():
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
         args.n_gpu = 1
     args.device = device
 
     # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
 
     # Set seed
     set_seed(args)
@@ -468,17 +578,23 @@ def main():
 
     args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          num_labels=num_labels,
-                                          finetuning_task=args.task_name,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=args.task_name,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
 
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -494,7 +610,6 @@ def main():
         global_step, tr_loss, best_steps = train(args, train_dataset, model, tokenizer)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
-
     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Create output directory if needed
@@ -504,19 +619,20 @@ def main():
         logger.info("Saving model checkpoint to %s", args.output_dir)
         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
         model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = model_class.from_pretrained(args.output_dir)
         tokenizer = tokenizer_class.from_pretrained(args.output_dir)
         model.to(args.device)
 
-
     # Evaluation
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
@@ -524,17 +640,19 @@ def main():
             args.output_dir = args.model_name_or_path
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
             logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-            
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
             result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
             results.update(result)
 
     if args.do_test and args.local_rank in [-1, 0]:
@@ -546,13 +664,13 @@ def main():
         #     logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-            
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
             result = evaluate(args, model, tokenizer, prefix=prefix, test=True)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
             results.update(result)
     if best_steps:
         logger.info("best steps of eval acc is the following checkpoints: %s", best_steps)
diff --git a/examples/run_ner.py b/examples/run_ner.py
index 0fdaacf2aa..48ac61b4fe 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -43,9 +43,12 @@ from transformers import XLMRobertaConfig, XLMRobertaForTokenClassification, XLM
 logger = logging.getLogger(__name__)
 
 ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig,
-                                                                   CamembertConfig, XLMRobertaConfig)),
-    ())
+    (
+        tuple(conf.pretrained_config_archive_map.keys())
+        for conf in (BertConfig, RobertaConfig, DistilBertConfig, CamembertConfig, XLMRobertaConfig)
+    ),
+    (),
+)
 
 MODEL_CLASSES = {
     "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
@@ -82,18 +85,24 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
     # Prepare optimizer and schedule (linear warmup and decay)
     no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
-        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-         "weight_decay": args.weight_decay},
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
 
     # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
+    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+        os.path.join(args.model_name_or_path, "scheduler.pt")
+    ):
         # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
 
     if args.fp16:
         try:
@@ -108,18 +117,21 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
 
     # Train!
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
     logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                args.train_batch_size * args.gradient_accumulation_steps * (
-                    torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
     logger.info("  Total optimization steps = %d", t_total)
 
@@ -129,7 +141,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
     # Check if continuing training from a checkpoint
     if os.path.exists(args.model_name_or_path):
         # set global_step to gobal_step of last saved checkpoint from model path
-        global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
+        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
         epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
         steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
 
@@ -140,7 +152,9 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
 
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
-    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+    )
     set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
@@ -153,11 +167,11 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
 
             model.train()
             batch = tuple(t.to(args.device) for t in batch)
-            inputs = {"input_ids": batch[0],
-                      "attention_mask": batch[1],
-                      "labels": batch[3]}
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
             if args.model_type != "distilbert":
-                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None  # XLM and RoBERTa don"t use segment_ids
+                inputs["token_type_ids"] = (
+                    batch[2] if args.model_type in ["bert", "xlnet"] else None
+                )  # XLM and RoBERTa don"t use segment_ids
 
             outputs = model(**inputs)
             loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
@@ -187,7 +201,9 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
 
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                         results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev")
                         for key, value in results.items():
                             tb_writer.add_scalar("eval_{}".format(key), value, global_step)
@@ -200,15 +216,17 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
                     output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                     model_to_save.save_pretrained(output_dir)
                     tokenizer.save_pretrained(output_dir)
 
                     torch.save(args, os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to %s", output_dir)
 
-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                     logger.info("Saving optimizer and scheduler states to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
@@ -249,11 +267,11 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
         batch = tuple(t.to(args.device) for t in batch)
 
         with torch.no_grad():
-            inputs = {"input_ids": batch[0],
-                      "attention_mask": batch[1],
-                      "labels": batch[3]}
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
             if args.model_type != "distilbert":
-                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None  # XLM and RoBERTa don"t use segment_ids
+                inputs["token_type_ids"] = (
+                    batch[2] if args.model_type in ["bert", "xlnet"] else None
+                )  # XLM and RoBERTa don"t use segment_ids
             outputs = model(**inputs)
             tmp_eval_loss, logits = outputs[:2]
 
@@ -287,7 +305,7 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
         "loss": eval_loss,
         "precision": precision_score(out_label_list, preds_list),
         "recall": recall_score(out_label_list, preds_list),
-        "f1": f1_score(out_label_list, preds_list)
+        "f1": f1_score(out_label_list, preds_list),
     }
 
     logger.info("***** Eval results %s *****", prefix)
@@ -302,29 +320,36 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
 
     # Load data features from cache or dataset file
-    cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format(mode,
-        list(filter(None, args.model_name_or_path.split("/"))).pop(),
-        str(args.max_seq_length)))
+    cached_features_file = os.path.join(
+        args.data_dir,
+        "cached_{}_{}_{}".format(
+            mode, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length)
+        ),
+    )
     if os.path.exists(cached_features_file) and not args.overwrite_cache:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
         logger.info("Creating features from dataset file at %s", args.data_dir)
         examples = read_examples_from_file(args.data_dir, mode)
-        features = convert_examples_to_features(examples, labels, args.max_seq_length, tokenizer,
-                                                cls_token_at_end=bool(args.model_type in ["xlnet"]),
-                                                # xlnet has a cls token at the end
-                                                cls_token=tokenizer.cls_token,
-                                                cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
-                                                sep_token=tokenizer.sep_token,
-                                                sep_token_extra=bool(args.model_type in ["roberta"]),
-                                                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
-                                                pad_on_left=bool(args.model_type in ["xlnet"]),
-                                                # pad on the left for xlnet
-                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-                                                pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
-                                                pad_token_label_id=pad_token_label_id
-                                                )
+        features = convert_examples_to_features(
+            examples,
+            labels,
+            args.max_seq_length,
+            tokenizer,
+            cls_token_at_end=bool(args.model_type in ["xlnet"]),
+            # xlnet has a cls token at the end
+            cls_token=tokenizer.cls_token,
+            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
+            sep_token=tokenizer.sep_token,
+            sep_token_extra=bool(args.model_type in ["roberta"]),
+            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
+            pad_on_left=bool(args.model_type in ["xlnet"]),
+            # pad on the left for xlnet
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
+            pad_token_label_id=pad_token_label_id,
+        )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
@@ -346,95 +371,151 @@ def main():
     parser = argparse.ArgumentParser()
 
     ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
 
     ## Other parameters
-    parser.add_argument("--labels", default="", type=str,
-                        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences longer "
-                             "than this will be truncated, sequences shorter will be padded.")
-    parser.add_argument("--do_train", action="store_true",
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true",
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_predict", action="store_true",
-                        help="Whether to run predictions on the test set.")
-    parser.add_argument("--evaluate_during_training", action="store_true",
-                        help="Whether to run evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action="store_true",
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--labels",
+        default="",
+        type=str,
+        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
+    )
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
+    parser.add_argument(
+        "--evaluate_during_training",
+        action="store_true",
+        help="Whether to run evaluation during training at each logging step.",
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
 
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
 
-    parser.add_argument("--logging_steps", type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action="store_true",
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action="store_true",
-                        help="Avoid using CUDA when available")
-    parser.add_argument("--overwrite_output_dir", action="store_true",
-                        help="Overwrite the content of the output directory")
-    parser.add_argument("--overwrite_cache", action="store_true",
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument("--seed", type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
 
-    parser.add_argument("--fp16", action="store_true",
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument("--fp16_opt_level", type=str, default="O1",
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
     parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
     parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
     args = parser.parse_args()
 
-    if os.path.exists(args.output_dir) and os.listdir(
-            args.output_dir) and args.do_train and not args.overwrite_output_dir:
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
         raise ValueError(
             "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir))
+                args.output_dir
+            )
+        )
 
     # Setup distant debugging if needed
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
+
         print("Waiting for debugger attach")
         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()
@@ -451,11 +532,19 @@ def main():
     args.device = device
 
     # Setup logging
-    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-                        datefmt="%m/%d/%Y %H:%M:%S",
-                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
 
     # Set seed
     set_seed(args)
@@ -472,16 +561,22 @@ def main():
 
     args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          num_labels=num_labels,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool(".ckpt" in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
 
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -505,7 +600,9 @@ def main():
         logger.info("Saving model checkpoint to %s", args.output_dir)
         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
         model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
@@ -518,7 +615,9 @@ def main():
         tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
             logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
@@ -565,4 +664,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 18a5a1c23f..1580a31e85 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -17,7 +17,11 @@
 
 from __future__ import absolute_import, division, print_function
 from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult
-from transformers.data.metrics.squad_metrics import compute_predictions_logits, compute_predictions_log_probs, squad_evaluate
+from transformers.data.metrics.squad_metrics import (
+    compute_predictions_logits,
+    compute_predictions_log_probs,
+    squad_evaluate,
+)
 
 import argparse
 import logging
@@ -27,8 +31,7 @@ import glob
 import timeit
 import numpy as np
 import torch
-from torch.utils.data import (
-    DataLoader, RandomSampler, SequentialSampler, TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 
 try:
@@ -38,32 +41,47 @@ except:
 
 from tqdm import tqdm, trange
 
-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForQuestionAnswering, BertTokenizer,
-                                  RobertaForQuestionAnswering, RobertaTokenizer, RobertaConfig,
-                                  XLMConfig, XLMForQuestionAnswering,
-                                  XLMTokenizer, XLNetConfig,
-                                  XLNetForQuestionAnswering,
-                                  XLNetTokenizer,
-                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer,
-                                  AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer,
-                                  XLMConfig, XLMForQuestionAnswering, XLMTokenizer,
-                                  )
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertForQuestionAnswering,
+    BertTokenizer,
+    RobertaForQuestionAnswering,
+    RobertaTokenizer,
+    RobertaConfig,
+    XLMConfig,
+    XLMForQuestionAnswering,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetForQuestionAnswering,
+    XLNetTokenizer,
+    DistilBertConfig,
+    DistilBertForQuestionAnswering,
+    DistilBertTokenizer,
+    AlbertConfig,
+    AlbertForQuestionAnswering,
+    AlbertTokenizer,
+    XLMConfig,
+    XLMForQuestionAnswering,
+    XLMTokenizer,
+)
 
 from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features
 
 logger = logging.getLogger(__name__)
 
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
-                  for conf in (BertConfig, RobertaConfig, XLNetConfig, XLMConfig)), ())
+ALL_MODELS = sum(
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, XLNetConfig, XLMConfig)),
+    (),
+)
 
 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
-    'roberta': (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer),
-    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
-    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
-    'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
+    "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer),
+    "roberta": (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer),
+    "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
+    "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
+    "albert": (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
 }
 
 
@@ -85,49 +103,44 @@ def train(args, train_dataset, model, tokenizer):
         tb_writer = SummaryWriter()
 
     args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_sampler = RandomSampler(
-        train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(
-        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
 
     if args.max_steps > 0:
         t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (
-            len(train_dataloader) // args.gradient_accumulation_steps) + 1
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
     else:
-        t_total = len(
-            train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
     # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(
-            nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(
-            nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
     ]
-    optimizer = AdamW(optimizer_grouped_parameters,
-                      lr=args.learning_rate, eps=args.adam_epsilon)
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
 
     # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
+    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+        os.path.join(args.model_name_or_path, "scheduler.pt")
+    ):
         # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(
-            os.path.join(args.model_name_or_path, 'optimizer.pt')))
-        scheduler.load_state_dict(torch.load(
-            os.path.join(args.model_name_or_path, 'scheduler.pt')))
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
 
     if args.fp16:
         try:
             from apex import amp
         except ImportError:
-            raise ImportError(
-                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
 
-        model, optimizer = amp.initialize(
-            model, optimizer, opt_level=args.fp16_opt_level)
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
 
     # multi-gpu training (should be after apex fp16 initialization)
     if args.n_gpu > 1:
@@ -135,20 +148,22 @@ def train(args, train_dataset, model, tokenizer):
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
 
     # Train!
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d",
-                args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
-    logger.info("  Gradient Accumulation steps = %d",
-                args.gradient_accumulation_steps)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
     logger.info("  Total optimization steps = %d", t_total)
 
     global_step = 1
@@ -157,29 +172,25 @@ def train(args, train_dataset, model, tokenizer):
     # Check if continuing training from a checkpoint
     if os.path.exists(args.model_name_or_path):
         # set global_step to gobal_step of last saved checkpoint from model path
-        global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
-        epochs_trained = global_step // (len(train_dataloader) //
-                                         args.gradient_accumulation_steps)
-        steps_trained_in_current_epoch = global_step % (
-            len(train_dataloader) // args.gradient_accumulation_steps)
+        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
+        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
+        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
 
-        logger.info(
-            "  Continuing training from checkpoint, will skip to saved global_step")
+        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
         logger.info("  Continuing training from epoch %d", epochs_trained)
         logger.info("  Continuing training from global step %d", global_step)
-        logger.info("  Will skip the first %d steps in the first epoch",
-                    steps_trained_in_current_epoch)
+        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
 
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
-    train_iterator = trange(epochs_trained, int(
-        args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+    )
     # Added here for reproductibility (even between python 2 and 3)
     set_seed(args)
 
     for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
-                              disable=args.local_rank not in [-1, 0])
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
 
             # Skip past any already trained steps if resuming training
@@ -191,18 +202,17 @@ def train(args, train_dataset, model, tokenizer):
             batch = tuple(t.to(args.device) for t in batch)
 
             inputs = {
-                'input_ids':       batch[0],
-                'attention_mask':  batch[1],
-                'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2],
-                'start_positions': batch[3],
-                'end_positions':   batch[4],
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                "token_type_ids": None if args.model_type in ["xlm", "roberta", "distilbert"] else batch[2],
+                "start_positions": batch[3],
+                "end_positions": batch[4],
             }
 
-            if args.model_type in ['xlnet', 'xlm']:
-                inputs.update({'cls_index': batch[5],
-                               'p_mask':       batch[6]})
+            if args.model_type in ["xlnet", "xlm"]:
+                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                 if args.version_2_with_negative:
-                    inputs.update({'is_impossible': batch[7]})
+                    inputs.update({"is_impossible": batch[7]})
             outputs = model(**inputs)
             # model outputs are always tuple in transformers (see doc)
             loss = outputs[0]
@@ -221,11 +231,9 @@ def train(args, train_dataset, model, tokenizer):
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
                 if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(
-                        amp.master_params(optimizer), args.max_grad_norm)
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                 else:
-                    torch.nn.utils.clip_grad_norm_(
-                        model.parameters(), args.max_grad_norm)
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
                 optimizer.step()
                 scheduler.step()  # Update learning rate schedule
@@ -238,36 +246,27 @@ def train(args, train_dataset, model, tokenizer):
                     if args.local_rank == -1 and args.evaluate_during_training:
                         results = evaluate(args, model, tokenizer)
                         for key, value in results.items():
-                            tb_writer.add_scalar(
-                                'eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar(
-                        'lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar(
-                        'loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                     logging_loss = tr_loss
 
                 # Save model checkpoint
                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    output_dir = os.path.join(
-                        args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
                     # Take care of distributed/parallel training
-                    model_to_save = model.module if hasattr(
-                        model, 'module') else model
+                    model_to_save = model.module if hasattr(model, "module") else model
                     model_to_save.save_pretrained(output_dir)
                     tokenizer.save_pretrained(output_dir)
 
-                    torch.save(args, os.path.join(
-                        output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to %s", output_dir)
 
-                    torch.save(optimizer.state_dict(), os.path.join(
-                        output_dir, 'optimizer.pt'))
-                    torch.save(scheduler.state_dict(), os.path.join(
-                        output_dir, 'scheduler.pt'))
-                    logger.info(
-                        "Saving optimizer and scheduler states to %s", output_dir)
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
                 epoch_iterator.close()
@@ -283,8 +282,7 @@ def train(args, train_dataset, model, tokenizer):
 
 
 def evaluate(args, model, tokenizer, prefix=""):
-    dataset, examples, features = load_and_cache_examples(
-        args, tokenizer, evaluate=True, output_examples=True)
+    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
 
     if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
         os.makedirs(args.output_dir)
@@ -293,8 +291,7 @@ def evaluate(args, model, tokenizer, prefix=""):
 
     # Note that DistributedSampler samples randomly
     eval_sampler = SequentialSampler(dataset)
-    eval_dataloader = DataLoader(
-        dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
     # multi-gpu evaluate
     if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
@@ -314,15 +311,15 @@ def evaluate(args, model, tokenizer, prefix=""):
 
         with torch.no_grad():
             inputs = {
-                'input_ids':      batch[0],
-                'attention_mask': batch[1],
-                'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2],
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                "token_type_ids": None if args.model_type in ["xlm", "roberta", "distilbert"] else batch[2],
             }
             example_indices = batch[3]
 
             # XLNet and XLM use more arguments for their predictions
-            if args.model_type in ['xlnet', 'xlm']:
-                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
+            if args.model_type in ["xlnet", "xlm"]:
+                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
 
             outputs = model(**inputs)
 
@@ -342,53 +339,68 @@ def evaluate(args, model, tokenizer, prefix=""):
                 cls_logits = output[4]
 
                 result = SquadResult(
-                    unique_id, start_logits, end_logits,
+                    unique_id,
+                    start_logits,
+                    end_logits,
                     start_top_index=start_top_index,
                     end_top_index=end_top_index,
-                    cls_logits=cls_logits
+                    cls_logits=cls_logits,
                 )
 
             else:
                 start_logits, end_logits = output
-                result = SquadResult(
-                    unique_id, start_logits, end_logits
-                )
+                result = SquadResult(unique_id, start_logits, end_logits)
 
             all_results.append(result)
 
     evalTime = timeit.default_timer() - start_time
-    logger.info("  Evaluation done in total %f secs (%f sec per example)",
-                evalTime, evalTime / len(dataset))
+    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))
 
     # Compute predictions
-    output_prediction_file = os.path.join(
-        args.output_dir, "predictions_{}.json".format(prefix))
-    output_nbest_file = os.path.join(
-        args.output_dir, "nbest_predictions_{}.json".format(prefix))
+    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
+    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
 
     if args.version_2_with_negative:
-        output_null_log_odds_file = os.path.join(
-            args.output_dir, "null_odds_{}.json".format(prefix))
+        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
     else:
         output_null_log_odds_file = None
 
     # XLNet and XLM use a more complex post-processing procedure
-    if args.model_type in ['xlnet', 'xlm']:
-        start_n_top = model.config.start_n_top if hasattr(
-            model, "config") else model.module.config.start_n_top
-        end_n_top = model.config.end_n_top if hasattr(
-            model, "config") else model.module.config.end_n_top
+    if args.model_type in ["xlnet", "xlm"]:
+        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
+        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
 
-        predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size,
-                                                    args.max_answer_length, output_prediction_file,
-                                                    output_nbest_file, output_null_log_odds_file,
-                                                    start_n_top, end_n_top,
-                                                    args.version_2_with_negative, tokenizer, args.verbose_logging)
+        predictions = compute_predictions_log_probs(
+            examples,
+            features,
+            all_results,
+            args.n_best_size,
+            args.max_answer_length,
+            output_prediction_file,
+            output_nbest_file,
+            output_null_log_odds_file,
+            start_n_top,
+            end_n_top,
+            args.version_2_with_negative,
+            tokenizer,
+            args.verbose_logging,
+        )
     else:
-        predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size,
-                        args.max_answer_length, args.do_lower_case, output_prediction_file,
-                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
-                        args.version_2_with_negative, args.null_score_diff_threshold, tokenizer)
+        predictions = compute_predictions_logits(
+            examples,
+            features,
+            all_results,
+            args.n_best_size,
+            args.max_answer_length,
+            args.do_lower_case,
+            output_prediction_file,
+            output_nbest_file,
+            output_null_log_odds_file,
+            args.verbose_logging,
+            args.version_2_with_negative,
+            args.null_score_diff_threshold,
+            tokenizer,
+        )
 
     # Compute the F1 and exact scores.
     results = squad_evaluate(examples, predictions)
@@ -402,16 +414,18 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
 
     # Load data features from cache or dataset file
     input_dir = args.data_dir if args.data_dir else "."
-    cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format(
-        'dev' if evaluate else 'train',
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length))
+    cached_features_file = os.path.join(
+        input_dir,
+        "cached_{}_{}_{}".format(
+            "dev" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+        ),
     )
 
     # Init features and dataset from cache if it exists
     if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
-        logger.info("Loading features from cached file %s",
-                    cached_features_file)
+        logger.info("Loading features from cached file %s", cached_features_file)
         features_and_dataset = torch.load(cached_features_file)
         features, dataset = features_and_dataset["features"], features_and_dataset["dataset"]
     else:
@@ -421,16 +435,13 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
             try:
                 import tensorflow_datasets as tfds
             except ImportError:
-                raise ImportError(
-                    "If not data_dir is specified, tensorflow_datasets needs to be installed.")
+                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")
 
             if args.version_2_with_negative:
-                logger.warn(
-                    "tensorflow_datasets does not handle version 2 of SQuAD.")
+                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")
 
             tfds_examples = tfds.load("squad")
-            examples = SquadV1Processor().get_examples_from_dataset(
-                tfds_examples, evaluate=evaluate)
+            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
         else:
             processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
             if evaluate:
@@ -445,15 +456,13 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
             doc_stride=args.doc_stride,
             max_query_length=args.max_query_length,
             is_training=not evaluate,
-            return_dataset='pt',
+            return_dataset="pt",
             threads=args.threads,
         )
 
         if args.local_rank in [-1, 0]:
-            logger.info("Saving features into cached file %s",
-                        cached_features_file)
-            torch.save({"features": features, "dataset": dataset},
-                       cached_features_file)
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save({"features": features, "dataset": dataset}, cached_features_file)
 
     if args.local_rank == 0 and not evaluate:
         # Make sure only the first process in distributed training process the dataset, and the others will use the cache
@@ -468,140 +477,232 @@ def main():
     parser = argparse.ArgumentParser()
 
     # Required parameters
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model checkpoints and predictions will be written.")
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model checkpoints and predictions will be written.",
+    )
 
     # Other parameters
-    parser.add_argument("--data_dir", default=None, type=str,
-                        help="The input data dir. Should contain the .json files for the task." +
-                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
-    parser.add_argument("--train_file", default=None, type=str,
-                        help="The input training file. If a data dir is specified, will look for the file there" +
-                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
-    parser.add_argument("--predict_file", default=None, type=str,
-                        help="The input evaluation file. If a data dir is specified, will look for the file there" +
-                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        help="The input data dir. Should contain the .json files for the task."
+        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
+    )
+    parser.add_argument(
+        "--train_file",
+        default=None,
+        type=str,
+        help="The input training file. If a data dir is specified, will look for the file there"
+        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
+    )
+    parser.add_argument(
+        "--predict_file",
+        default=None,
+        type=str,
+        help="The input evaluation file. If a data dir is specified, will look for the file there"
+        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
+    )
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
 
-    parser.add_argument('--version_2_with_negative', action='store_true',
-                        help='If true, the SQuAD examples contain some that do not have an answer.')
-    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
-                        help="If null_score - best_non_null is greater than the threshold predict null.")
+    parser.add_argument(
+        "--version_2_with_negative",
+        action="store_true",
+        help="If true, the SQuAD examples contain some that do not have an answer.",
+    )
+    parser.add_argument(
+        "--null_score_diff_threshold",
+        type=float,
+        default=0.0,
+        help="If null_score - best_non_null is greater than the threshold predict null.",
+    )
 
-    parser.add_argument("--max_seq_length", default=384, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
-                             "longer than this will be truncated, and sequences shorter than this will be padded.")
-    parser.add_argument("--doc_stride", default=128, type=int,
-                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
-    parser.add_argument("--max_query_length", default=64, type=int,
-                        help="The maximum number of tokens for the question. Questions longer than this will "
-                             "be truncated to this length.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--max_seq_length",
+        default=384,
+        type=int,
+        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
+        "longer than this will be truncated, and sequences shorter than this will be padded.",
+    )
+    parser.add_argument(
+        "--doc_stride",
+        default=128,
+        type=int,
+        help="When splitting up a long document into chunks, how much stride to take between chunks.",
+    )
+    parser.add_argument(
+        "--max_query_length",
+        default=64,
+        type=int,
+        help="The maximum number of tokens for the question. Questions longer than this will "
+        "be truncated to this length.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
 
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
-    parser.add_argument("--n_best_size", default=20, type=int,
-                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
-    parser.add_argument("--max_answer_length", default=30, type=int,
-                        help="The maximum length of an answer that can be generated. This is needed because the start "
-                             "and end predictions are not conditioned on one another.")
-    parser.add_argument("--verbose_logging", action='store_true',
-                        help="If true, all of the warnings related to data processing will be printed. "
-                             "A number of warnings are expected for a normal SQuAD evaluation.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+    parser.add_argument(
+        "--n_best_size",
+        default=20,
+        type=int,
+        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
+    )
+    parser.add_argument(
+        "--max_answer_length",
+        default=30,
+        type=int,
+        help="The maximum length of an answer that can be generated. This is needed because the start "
+        "and end predictions are not conditioned on one another.",
+    )
+    parser.add_argument(
+        "--verbose_logging",
+        action="store_true",
+        help="If true, all of the warnings related to data processing will be printed. "
+        "A number of warnings are expected for a normal SQuAD evaluation.",
+    )
 
-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
 
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
 
-    parser.add_argument('--threads', type=int, default=1, help='multiple threads for converting example to features')
+    parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")
     args = parser.parse_args()
 
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
         raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
 
     # Setup distant debugging if needed
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
+
         print("Waiting for debugger attach")
-        ptvsd.enable_attach(
-            address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()
 
     # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
-        device = torch.device(
-            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         args.n_gpu = torch.cuda.device_count()
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
         args.n_gpu = 1
     args.device = device
 
     # Setup logging
-    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt='%m/%d/%Y %H:%M:%S',
-                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
 
     # Set seed
     set_seed(args)
@@ -613,16 +714,21 @@ def main():
 
     args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool(
-                                            '.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
 
     if args.local_rank == 0:
         # Make sure only the first process in distributed training will download model & vocab
@@ -638,18 +744,16 @@ def main():
     if args.fp16:
         try:
             import apex
-            apex.amp.register_half_function(torch, 'einsum')
+
+            apex.amp.register_half_function(torch, "einsum")
         except ImportError:
-            raise ImportError(
-                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
 
     # Training
     if args.do_train:
-        train_dataset = load_and_cache_examples(
-            args, tokenizer, evaluate=False, output_examples=False)
+        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
         global_step, tr_loss = train(args, train_dataset, model, tokenizer)
-        logger.info(" global_step = %s, average loss = %s",
-                    global_step, tr_loss)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
     # Save the trained model and the tokenizer
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
@@ -661,18 +765,16 @@ def main():
         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
         # Take care of distributed/parallel training
-        model_to_save = model.module if hasattr(model, 'module') else model
+        model_to_save = model.module if hasattr(model, "module") else model
         model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
 
         # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(
-            args.output_dir, force_download=True)
-        tokenizer = tokenizer_class.from_pretrained(
-            args.output_dir, do_lower_case=args.do_lower_case)
+        model = model_class.from_pretrained(args.output_dir, force_download=True)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
         model.to(args.device)
 
     # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
@@ -682,7 +784,10 @@ def main():
             logger.info("Loading checkpoints saved during training for evaluation")
             checkpoints = [args.output_dir]
             if args.eval_all_checkpoints:
-                checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+                checkpoints = list(
+                    os.path.dirname(c)
+                    for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+                )
                 logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
         else:
             logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
@@ -692,17 +797,14 @@ def main():
 
         for checkpoint in checkpoints:
             # Reload the model
-            global_step = checkpoint.split(
-                '-')[-1] if len(checkpoints) > 1 else ""
-            model = model_class.from_pretrained(
-                checkpoint, force_download=True)
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            model = model_class.from_pretrained(checkpoint, force_download=True)
             model.to(args.device)
 
             # Evaluate
             result = evaluate(args, model, tokenizer, prefix=global_step)
 
-            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v)
-                          for k, v in result.items())
+            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
             results.update(result)
 
     logger.info("Results: {}".format(results))
diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py
index 54282277d2..74a6db34ad 100644
--- a/examples/run_tf_glue.py
+++ b/examples/run_tf_glue.py
@@ -1,7 +1,14 @@
 import os
 import tensorflow as tf
 import tensorflow_datasets
-from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig, glue_convert_examples_to_features, BertForSequenceClassification, glue_processors
+from transformers import (
+    BertTokenizer,
+    TFBertForSequenceClassification,
+    BertConfig,
+    glue_convert_examples_to_features,
+    BertForSequenceClassification,
+    glue_processors,
+)
 
 # script parameters
 BATCH_SIZE = 32
@@ -16,7 +23,7 @@ if TASK == "sst-2":
     TFDS_TASK = "sst2"
 elif TASK == "sts-b":
     TFDS_TASK = "stsb"
-else: 
+else:
     TFDS_TASK = TASK
 
 num_labels = len(glue_processors[TASK]().get_labels())
@@ -27,29 +34,29 @@ tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
 
 # Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
 config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels)
-tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
-model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config)
+tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+model = TFBertForSequenceClassification.from_pretrained("bert-base-cased", config=config)
 
 # Load dataset via TensorFlow Datasets
-data, info = tensorflow_datasets.load(f'glue/{TFDS_TASK}', with_info=True)
-train_examples = info.splits['train'].num_examples
+data, info = tensorflow_datasets.load(f"glue/{TFDS_TASK}", with_info=True)
+train_examples = info.splits["train"].num_examples
 
 # MNLI expects either validation_matched or validation_mismatched
-valid_examples = info.splits['validation'].num_examples
+valid_examples = info.splits["validation"].num_examples
 
 # Prepare dataset for GLUE as a tf.data.Dataset instance
-train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, TASK)
+train_dataset = glue_convert_examples_to_features(data["train"], tokenizer, 128, TASK)
 
 # MNLI expects either validation_matched or validation_mismatched
-valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, TASK)
+valid_dataset = glue_convert_examples_to_features(data["validation"], tokenizer, 128, TASK)
 train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
 valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
 
-# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
+# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
 opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
 if USE_AMP:
     # loss scaling is currently required when using mixed precision
-    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
+    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic")
 
 
 if num_labels == 1:
@@ -57,37 +64,42 @@ if num_labels == 1:
 else:
     loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
 
-metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
 model.compile(optimizer=opt, loss=loss, metrics=[metric])
 
 # Train and evaluate using tf.keras.Model.fit()
-train_steps = train_examples//BATCH_SIZE
-valid_steps = valid_examples//EVAL_BATCH_SIZE
+train_steps = train_examples // BATCH_SIZE
+valid_steps = valid_examples // EVAL_BATCH_SIZE
 
-history = model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=train_steps,
-                    validation_data=valid_dataset, validation_steps=valid_steps)
+history = model.fit(
+    train_dataset,
+    epochs=EPOCHS,
+    steps_per_epoch=train_steps,
+    validation_data=valid_dataset,
+    validation_steps=valid_steps,
+)
 
 # Save TF2 model
-os.makedirs('./save/', exist_ok=True)
-model.save_pretrained('./save/')
+os.makedirs("./save/", exist_ok=True)
+model.save_pretrained("./save/")
 
 if TASK == "mrpc":
     # Load the TensorFlow model in PyTorch for inspection
-    # This is to demo the interoperability between the two frameworks, you don't have to 
+    # This is to demo the interoperability between the two frameworks, you don't have to
     # do this in real life (you can run the inference on the TF model).
-    pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
+    pytorch_model = BertForSequenceClassification.from_pretrained("./save/", from_tf=True)
 
     # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
-    sentence_0 = 'This research was consistent with his findings.'
-    sentence_1 = 'His findings were compatible with this research.'
-    sentence_2 = 'His findings were not compatible with this research.'
-    inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
-    inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
+    sentence_0 = "This research was consistent with his findings."
+    sentence_1 = "His findings were compatible with this research."
+    sentence_2 = "His findings were not compatible with this research."
+    inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors="pt")
+    inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors="pt")
 
     del inputs_1["special_tokens_mask"]
     del inputs_2["special_tokens_mask"]
 
     pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
     pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
-    print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
-    print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')
+    print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
+    print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
diff --git a/examples/run_tf_ner.py b/examples/run_tf_ner.py
index eb284f4c2a..77850d1ab5 100644
--- a/examples/run_tf_ner.py
+++ b/examples/run_tf_ner.py
@@ -21,189 +21,156 @@ from absl import app
 
 
 ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
-    ())
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)), ()
+)
 
 MODEL_CLASSES = {
     "bert": (BertConfig, TFBertForTokenClassification, BertTokenizer),
     "roberta": (RobertaConfig, TFRobertaForTokenClassification, RobertaTokenizer),
-    "distilbert": (DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer)
+    "distilbert": (DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer),
 }
 
 
 flags.DEFINE_string(
-    "data_dir", None,
-    "The input data dir. Should contain the .conll files (or other data files) "
-    "for the task.")
+    "data_dir", None, "The input data dir. Should contain the .conll files (or other data files) " "for the task."
+)
+
+flags.DEFINE_string("model_type", None, "Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
 
 flags.DEFINE_string(
-    "model_type", None,
-    "Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    "model_name_or_path",
+    None,
+    "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+)
+
+flags.DEFINE_string("output_dir", None, "The output directory where the model checkpoints will be written.")
 
 flags.DEFINE_string(
-    "model_name_or_path", None,
-    "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+    "labels", "", "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."
+)
 
-flags.DEFINE_string(
-    "output_dir", None,
-    "The output directory where the model checkpoints will be written.")
+flags.DEFINE_string("config_name", "", "Pretrained config name or path if not the same as model_name")
 
-flags.DEFINE_string(
-    "labels", "",
-    "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
+flags.DEFINE_string("tokenizer_name", "", "Pretrained tokenizer name or path if not the same as model_name")
 
-flags.DEFINE_string(
-    "config_name", "",
-    "Pretrained config name or path if not the same as model_name")
-
-flags.DEFINE_string(
-    "tokenizer_name", "",
-    "Pretrained tokenizer name or path if not the same as model_name")
-
-flags.DEFINE_string(
-    "cache_dir", "",
-    "Where do you want to store the pre-trained models downloaded from s3")
+flags.DEFINE_string("cache_dir", "", "Where do you want to store the pre-trained models downloaded from s3")
 
 flags.DEFINE_integer(
-    "max_seq_length", 128,
+    "max_seq_length",
+    128,
     "The maximum total input sentence length after tokenization. "
     "Sequences longer than this will be truncated, sequences shorter "
-    "will be padded.")
+    "will be padded.",
+)
 
 flags.DEFINE_string(
-    "tpu", None,
+    "tpu",
+    None,
     "The Cloud TPU to use for training. This should be either the name "
     "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
-    "url.")
+    "url.",
+)
 
-flags.DEFINE_integer(
-    "num_tpu_cores", 8,
-    "Total number of TPU cores to use.")
+flags.DEFINE_integer("num_tpu_cores", 8, "Total number of TPU cores to use.")
+
+flags.DEFINE_boolean("do_train", False, "Whether to run training.")
+
+flags.DEFINE_boolean("do_eval", False, "Whether to run eval on the dev set.")
+
+flags.DEFINE_boolean("do_predict", False, "Whether to run predictions on the test set.")
 
 flags.DEFINE_boolean(
-    "do_train", False,
-    "Whether to run training.")
+    "evaluate_during_training", False, "Whether to run evaluation during training at each logging step."
+)
 
-flags.DEFINE_boolean(
-    "do_eval", False,
-    "Whether to run eval on the dev set.")
+flags.DEFINE_boolean("do_lower_case", False, "Set this flag if you are using an uncased model.")
 
-flags.DEFINE_boolean(
-    "do_predict", False,
-    "Whether to run predictions on the test set.")
+flags.DEFINE_integer("per_device_train_batch_size", 8, "Batch size per GPU/CPU/TPU for training.")
 
-flags.DEFINE_boolean(
-    "evaluate_during_training", False,
-    "Whether to run evaluation during training at each logging step.")
-
-flags.DEFINE_boolean(
-    "do_lower_case", False,
-    "Set this flag if you are using an uncased model.")
+flags.DEFINE_integer("per_device_eval_batch_size", 8, "Batch size per GPU/CPU/TPU for evaluation.")
 
 flags.DEFINE_integer(
-    "per_device_train_batch_size", 8,
-    "Batch size per GPU/CPU/TPU for training.")
+    "gradient_accumulation_steps", 1, "Number of updates steps to accumulate before performing a backward/update pass."
+)
+
+flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
+
+flags.DEFINE_float("weight_decay", 0.0, "Weight decay if we apply some.")
+
+flags.DEFINE_float("adam_epsilon", 1e-8, "Epsilon for Adam optimizer.")
+
+flags.DEFINE_float("max_grad_norm", 1.0, "Max gradient norm.")
+
+flags.DEFINE_integer("num_train_epochs", 3, "Total number of training epochs to perform.")
 
 flags.DEFINE_integer(
-    "per_device_eval_batch_size", 8,
-    "Batch size per GPU/CPU/TPU for evaluation.")
+    "max_steps", -1, "If > 0: set total number of training steps to perform. Override num_train_epochs."
+)
 
-flags.DEFINE_integer(
-    "gradient_accumulation_steps", 1,
-    "Number of updates steps to accumulate before performing a backward/update pass.")
+flags.DEFINE_integer("warmup_steps", 0, "Linear warmup over warmup_steps.")
 
-flags.DEFINE_float(
-    "learning_rate", 5e-5,
-    "The initial learning rate for Adam.")
+flags.DEFINE_integer("logging_steps", 50, "Log every X updates steps.")
 
-flags.DEFINE_float(
-    "weight_decay", 0.0,
-    "Weight decay if we apply some.")
-
-flags.DEFINE_float(
-    "adam_epsilon", 1e-8,
-    "Epsilon for Adam optimizer.")
-
-flags.DEFINE_float(
-    "max_grad_norm", 1.0,
-    "Max gradient norm.")
-
-flags.DEFINE_integer(
-    "num_train_epochs", 3,
-    "Total number of training epochs to perform.")
-
-flags.DEFINE_integer(
-    "max_steps", -1,
-    "If > 0: set total number of training steps to perform. Override num_train_epochs.")
-
-flags.DEFINE_integer(
-    "warmup_steps", 0,
-    "Linear warmup over warmup_steps.")
-
-flags.DEFINE_integer(
-    "logging_steps", 50,
-    "Log every X updates steps.")
-
-flags.DEFINE_integer(
-    "save_steps", 50,
-    "Save checkpoint every X updates steps.")
+flags.DEFINE_integer("save_steps", 50, "Save checkpoint every X updates steps.")
 
 flags.DEFINE_boolean(
-    "eval_all_checkpoints", False,
-    "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+    "eval_all_checkpoints",
+    False,
+    "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+)
 
-flags.DEFINE_boolean(
-    "no_cuda", False,
-    "Avoid using CUDA when available")
+flags.DEFINE_boolean("no_cuda", False, "Avoid using CUDA when available")
 
-flags.DEFINE_boolean(
-    "overwrite_output_dir", False,
-    "Overwrite the content of the output directory")
+flags.DEFINE_boolean("overwrite_output_dir", False, "Overwrite the content of the output directory")
 
-flags.DEFINE_boolean(
-    "overwrite_cache", False,
-    "Overwrite the cached training and evaluation sets")
+flags.DEFINE_boolean("overwrite_cache", False, "Overwrite the cached training and evaluation sets")
 
-flags.DEFINE_integer(
-    "seed", 42,
-    "random seed for initialization")
+flags.DEFINE_integer("seed", 42, "random seed for initialization")
 
-flags.DEFINE_boolean(
-    "fp16", False,
-    "Whether to use 16-bit (mixed) precision instead of 32-bit")
+flags.DEFINE_boolean("fp16", False, "Whether to use 16-bit (mixed) precision instead of 32-bit")
 
 flags.DEFINE_string(
-    "gpus", "0",
+    "gpus",
+    "0",
     "Comma separated list of gpus devices. If only one, switch to single "
-    "gpu strategy, if None takes all the gpus available.")
+    "gpu strategy, if None takes all the gpus available.",
+)
 
 
-def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id):
-    if args['max_steps'] > 0:
-        num_train_steps = args['max_steps'] * args['gradient_accumulation_steps']
-        args['num_train_epochs'] = 1
+def train(
+    args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id
+):
+    if args["max_steps"] > 0:
+        num_train_steps = args["max_steps"] * args["gradient_accumulation_steps"]
+        args["num_train_epochs"] = 1
     else:
-        num_train_steps = math.ceil(num_train_examples / train_batch_size) // args['gradient_accumulation_steps'] * args['num_train_epochs']
+        num_train_steps = (
+            math.ceil(num_train_examples / train_batch_size)
+            // args["gradient_accumulation_steps"]
+            * args["num_train_epochs"]
+        )
 
     writer = tf.summary.create_file_writer("/tmp/mylogs")
 
     with strategy.scope():
         loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
-        optimizer = create_optimizer(args['learning_rate'], num_train_steps, args['warmup_steps'])
+        optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"])
 
-        if args['fp16']:
-            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
+        if args["fp16"]:
+            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
 
-        loss_metric = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)
+        loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
         gradient_accumulator = GradientAccumulator()
-        
+
     logging.info("***** Running training *****")
     logging.info("  Num examples = %d", num_train_examples)
-    logging.info("  Num Epochs = %d", args['num_train_epochs'])
-    logging.info("  Instantaneous batch size per device = %d", args['per_device_train_batch_size'])
-    logging.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                train_batch_size * args['gradient_accumulation_steps'])
-    logging.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
+    logging.info("  Num Epochs = %d", args["num_train_epochs"])
+    logging.info("  Instantaneous batch size per device = %d", args["per_device_train_batch_size"])
+    logging.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        train_batch_size * args["gradient_accumulation_steps"],
+    )
+    logging.info("  Gradient Accumulation steps = %d", args["gradient_accumulation_steps"])
     logging.info("  Total training steps = %d", num_train_steps)
 
     model.summary()
@@ -214,26 +181,28 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l
 
         for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables):
             if gradient is not None:
-                scaled_gradient = gradient / (args['n_device'] * args['gradient_accumulation_steps'])
+                scaled_gradient = gradient / (args["n_device"] * args["gradient_accumulation_steps"])
                 grads_and_vars.append((scaled_gradient, variable))
             else:
                 grads_and_vars.append((gradient, variable))
 
-        optimizer.apply_gradients(grads_and_vars, args['max_grad_norm'])
+        optimizer.apply_gradients(grads_and_vars, args["max_grad_norm"])
         gradient_accumulator.reset()
 
     @tf.function
     def train_step(train_features, train_labels):
         def step_fn(train_features, train_labels):
-            inputs = {'attention_mask': train_features['input_mask'], 'training': True}
+            inputs = {"attention_mask": train_features["input_mask"], "training": True}
 
-            if args['model_type'] != "distilbert":
-                inputs["token_type_ids"] = train_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None
+            if args["model_type"] != "distilbert":
+                inputs["token_type_ids"] = (
+                    train_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
+                )
 
             with tf.GradientTape() as tape:
-                logits = model(train_features['input_ids'], **inputs)[0]
+                logits = model(train_features["input_ids"], **inputs)[0]
                 logits = tf.reshape(logits, (-1, len(labels) + 1))
-                active_loss = tf.reshape(train_features['input_mask'], (-1,))
+                active_loss = tf.reshape(train_features["input_mask"], (-1,))
                 active_logits = tf.boolean_mask(logits, active_loss)
                 train_labels = tf.reshape(train_labels, (-1,))
                 active_labels = tf.boolean_mask(train_labels, active_loss)
@@ -251,34 +220,40 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l
         return mean_loss
 
     current_time = datetime.datetime.now()
-    train_iterator = master_bar(range(args['num_train_epochs']))
+    train_iterator = master_bar(range(args["num_train_epochs"]))
     global_step = 0
     logging_loss = 0.0
 
     for epoch in train_iterator:
-        epoch_iterator = progress_bar(train_dataset, total=num_train_steps, parent=train_iterator, display=args['n_device'] > 1)
+        epoch_iterator = progress_bar(
+            train_dataset, total=num_train_steps, parent=train_iterator, display=args["n_device"] > 1
+        )
         step = 1
 
         with strategy.scope():
             for train_features, train_labels in epoch_iterator:
                 loss = train_step(train_features, train_labels)
 
-                if step % args['gradient_accumulation_steps'] == 0:
+                if step % args["gradient_accumulation_steps"] == 0:
                     strategy.experimental_run_v2(apply_gradients)
 
                     loss_metric(loss)
 
                     global_step += 1
 
-                    if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
+                    if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
                         # Log metrics
-                        if args['n_device'] == 1 and args['evaluate_during_training']:  # Only evaluate when single GPU otherwise metrics may not average well
-                            y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
+                        if (
+                            args["n_device"] == 1 and args["evaluate_during_training"]
+                        ):  # Only evaluate when single GPU otherwise metrics may not average well
+                            y_true, y_pred, eval_loss = evaluate(
+                                args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
+                            )
                             report = metrics.classification_report(y_true, y_pred, digits=4)
-                            
+
                             logging.info("Eval at step " + str(global_step) + "\n" + report)
                             logging.info("eval_loss: " + str(eval_loss))
-                            
+
                             precision = metrics.precision_score(y_true, y_pred)
                             recall = metrics.recall_score(y_true, y_pred)
                             f1 = metrics.f1_score(y_true, y_pred)
@@ -288,33 +263,35 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l
                                 tf.summary.scalar("precision", precision, global_step)
                                 tf.summary.scalar("recall", recall, global_step)
                                 tf.summary.scalar("f1", f1, global_step)
-                        
+
                         lr = optimizer.learning_rate
                         learning_rate = lr(step)
 
                         with writer.as_default():
                             tf.summary.scalar("lr", learning_rate, global_step)
-                            tf.summary.scalar("loss", (loss_metric.result() - logging_loss) / args['logging_steps'], global_step)
-                        
+                            tf.summary.scalar(
+                                "loss", (loss_metric.result() - logging_loss) / args["logging_steps"], global_step
+                            )
+
                         logging_loss = loss_metric.result()
 
                     with writer.as_default():
                         tf.summary.scalar("loss", loss_metric.result(), step=step)
 
-                    if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
+                    if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
                         # Save model checkpoint
-                        output_dir = os.path.join(args['output_dir'], "checkpoint-{}".format(global_step))
+                        output_dir = os.path.join(args["output_dir"], "checkpoint-{}".format(global_step))
 
                         if not os.path.exists(output_dir):
                             os.makedirs(output_dir)
-                        
+
                         model.save_pretrained(output_dir)
                         logging.info("Saving model checkpoint to %s", output_dir)
-                
-                train_iterator.child.comment = f'loss : {loss_metric.result()}'
+
+                train_iterator.child.comment = f"loss : {loss_metric.result()}"
                 step += 1
 
-        train_iterator.write(f'loss epoch {epoch + 1}: {loss_metric.result()}')
+        train_iterator.write(f"loss epoch {epoch + 1}: {loss_metric.result()}")
 
         loss_metric.reset_states()
 
@@ -322,13 +299,15 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l
 
 
 def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode):
-    eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
-    eval_dataset, size = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode)
+    eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"]
+    eval_dataset, size = load_and_cache_examples(
+        args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode
+    )
     eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)
     preds = None
     num_eval_steps = math.ceil(size / eval_batch_size)
     master = master_bar(range(1))
-    eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args['n_device'] > 1)
+    eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args["n_device"] > 1)
     loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
     loss = 0.0
 
@@ -337,15 +316,17 @@ def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode)
     logging.info("  Batch size = %d", eval_batch_size)
 
     for eval_features, eval_labels in eval_iterator:
-        inputs = {'attention_mask': eval_features['input_mask'], 'training': False}
+        inputs = {"attention_mask": eval_features["input_mask"], "training": False}
 
-        if args['model_type'] != "distilbert":
-            inputs["token_type_ids"] = eval_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None
+        if args["model_type"] != "distilbert":
+            inputs["token_type_ids"] = (
+                eval_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
+            )
 
         with strategy.scope():
-            logits = model(eval_features['input_ids'], **inputs)[0]
+            logits = model(eval_features["input_ids"], **inputs)[0]
             tmp_logits = tf.reshape(logits, (-1, len(labels) + 1))
-            active_loss = tf.reshape(eval_features['input_mask'], (-1,))
+            active_loss = tf.reshape(eval_features["input_mask"], (-1,))
             active_logits = tf.boolean_mask(tmp_logits, active_loss)
             tmp_eval_labels = tf.reshape(eval_labels, (-1,))
             active_labels = tf.boolean_mask(tmp_eval_labels, active_loss)
@@ -384,11 +365,11 @@ def load_cache(cached_file, max_seq_length):
     def _decode_record(record):
         example = tf.io.parse_single_example(record, name_to_features)
         features = {}
-        features['input_ids'] = example['input_ids']
-        features['input_mask'] = example['input_mask']
-        features['segment_ids'] = example['segment_ids']
+        features["input_ids"] = example["input_ids"]
+        features["input_mask"] = example["input_mask"]
+        features["segment_ids"] = example["segment_ids"]
 
-        return features, example['label_ids']
+        return features, example["label_ids"]
 
     d = tf.data.TFRecordDataset(cached_file)
     d = d.map(_decode_record, num_parallel_calls=4)
@@ -422,39 +403,46 @@ def save_cache(features, cached_features_file):
 
 
 def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_size, mode):
-    drop_remainder = True if args['tpu'] or mode == 'train' else False
+    drop_remainder = True if args["tpu"] or mode == "train" else False
 
     # Load data features from cache or dataset file
-    cached_features_file = os.path.join(args['data_dir'], "cached_{}_{}_{}.tf_record".format(mode,
-        list(filter(None, args['model_name_or_path'].split("/"))).pop(),
-        str(args['max_seq_length'])))
-    if os.path.exists(cached_features_file) and not args['overwrite_cache']:
+    cached_features_file = os.path.join(
+        args["data_dir"],
+        "cached_{}_{}_{}.tf_record".format(
+            mode, list(filter(None, args["model_name_or_path"].split("/"))).pop(), str(args["max_seq_length"])
+        ),
+    )
+    if os.path.exists(cached_features_file) and not args["overwrite_cache"]:
         logging.info("Loading features from cached file %s", cached_features_file)
-        dataset, size = load_cache(cached_features_file, args['max_seq_length'])
+        dataset, size = load_cache(cached_features_file, args["max_seq_length"])
     else:
-        logging.info("Creating features from dataset file at %s", args['data_dir'])
-        examples = read_examples_from_file(args['data_dir'], mode)
-        features = convert_examples_to_features(examples, labels, args['max_seq_length'], tokenizer,
-                                                cls_token_at_end=bool(args['model_type'] in ["xlnet"]),
-                                                # xlnet has a cls token at the end
-                                                cls_token=tokenizer.cls_token,
-                                                cls_token_segment_id=2 if args['model_type'] in ["xlnet"] else 0,
-                                                sep_token=tokenizer.sep_token,
-                                                sep_token_extra=bool(args['model_type'] in ["roberta"]),
-                                                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
-                                                pad_on_left=bool(args['model_type'] in ["xlnet"]),
-                                                # pad on the left for xlnet
-                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-                                                pad_token_segment_id=4 if args['model_type'] in ["xlnet"] else 0,
-                                                pad_token_label_id=pad_token_label_id
-                                                )
+        logging.info("Creating features from dataset file at %s", args["data_dir"])
+        examples = read_examples_from_file(args["data_dir"], mode)
+        features = convert_examples_to_features(
+            examples,
+            labels,
+            args["max_seq_length"],
+            tokenizer,
+            cls_token_at_end=bool(args["model_type"] in ["xlnet"]),
+            # xlnet has a cls token at the end
+            cls_token=tokenizer.cls_token,
+            cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0,
+            sep_token=tokenizer.sep_token,
+            sep_token_extra=bool(args["model_type"] in ["roberta"]),
+            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
+            pad_on_left=bool(args["model_type"] in ["xlnet"]),
+            # pad on the left for xlnet
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
+            pad_token_label_id=pad_token_label_id,
+        )
         logging.info("Saving features into cached file %s", cached_features_file)
         save_cache(features, cached_features_file)
-        dataset, size = load_cache(cached_features_file, args['max_seq_length'])
+        dataset, size = load_cache(cached_features_file, args["max_seq_length"])
 
-    if mode == 'train':
+    if mode == "train":
         dataset = dataset.repeat()
-        dataset = dataset.shuffle(buffer_size=8192, seed=args['seed'])
+        dataset = dataset.shuffle(buffer_size=8192, seed=args["seed"])
 
     dataset = dataset.batch(batch_size, drop_remainder)
     dataset = dataset.prefetch(buffer_size=batch_size)
@@ -466,98 +454,134 @@ def main(_):
     logging.set_verbosity(logging.INFO)
     args = flags.FLAGS.flag_values_dict()
 
-    if os.path.exists(args['output_dir']) and os.listdir(
-            args['output_dir']) and args['do_train'] and not args['overwrite_output_dir']:
+    if (
+        os.path.exists(args["output_dir"])
+        and os.listdir(args["output_dir"])
+        and args["do_train"]
+        and not args["overwrite_output_dir"]
+    ):
         raise ValueError(
             "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args['output_dir']))
+                args["output_dir"]
+            )
+        )
 
-    if args['fp16']:
+    if args["fp16"]:
         tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
 
-    if args['tpu']:
-        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args['tpu'])
+    if args["tpu"]:
+        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args["tpu"])
         tf.config.experimental_connect_to_cluster(resolver)
         tf.tpu.experimental.initialize_tpu_system(resolver)
         strategy = tf.distribute.experimental.TPUStrategy(resolver)
-        args['n_device'] = args['num_tpu_cores']
-    elif len(args['gpus'].split(',')) > 1:
-        args['n_device'] = len([f"/gpu:{gpu}" for gpu in args['gpus'].split(',')])
-        strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args['gpus'].split(',')])
-    elif args['no_cuda']:
-        args['n_device'] = 1
+        args["n_device"] = args["num_tpu_cores"]
+    elif len(args["gpus"].split(",")) > 1:
+        args["n_device"] = len([f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
+        strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
+    elif args["no_cuda"]:
+        args["n_device"] = 1
         strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
     else:
-        args['n_device'] = len(args['gpus'].split(','))
-        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args['gpus'].split(',')[0])
+        args["n_device"] = len(args["gpus"].split(","))
+        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args["gpus"].split(",")[0])
 
-    logging.warning("n_device: %s, distributed training: %s, 16-bits training: %s",
-                   args['n_device'], bool(args['n_device'] > 1), args['fp16'])
+    logging.warning(
+        "n_device: %s, distributed training: %s, 16-bits training: %s",
+        args["n_device"],
+        bool(args["n_device"] > 1),
+        args["fp16"],
+    )
 
-    labels = get_labels(args['labels'])
+    labels = get_labels(args["labels"])
     num_labels = len(labels) + 1
     pad_token_label_id = 0
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
-    config = config_class.from_pretrained(args['config_name'] if args['config_name'] else args['model_name_or_path'],
-                                          num_labels=num_labels,
-                                          cache_dir=args['cache_dir'] if args['cache_dir'] else None)
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args["model_type"]]
+    config = config_class.from_pretrained(
+        args["config_name"] if args["config_name"] else args["model_name_or_path"],
+        num_labels=num_labels,
+        cache_dir=args["cache_dir"] if args["cache_dir"] else None,
+    )
 
     logging.info("Training/evaluation parameters %s", args)
 
     # Training
-    if args['do_train']:
-        tokenizer = tokenizer_class.from_pretrained(args['tokenizer_name'] if args['tokenizer_name'] else args['model_name_or_path'],
-                                                    do_lower_case=args['do_lower_case'],
-                                                    cache_dir=args['cache_dir'] if args['cache_dir'] else None)
+    if args["do_train"]:
+        tokenizer = tokenizer_class.from_pretrained(
+            args["tokenizer_name"] if args["tokenizer_name"] else args["model_name_or_path"],
+            do_lower_case=args["do_lower_case"],
+            cache_dir=args["cache_dir"] if args["cache_dir"] else None,
+        )
 
         with strategy.scope():
-            model = model_class.from_pretrained(args['model_name_or_path'],
-                                                from_pt=bool(".bin" in args['model_name_or_path']),
-                                                config=config,
-                                                cache_dir=args['cache_dir'] if args['cache_dir'] else None)
+            model = model_class.from_pretrained(
+                args["model_name_or_path"],
+                from_pt=bool(".bin" in args["model_name_or_path"]),
+                config=config,
+                cache_dir=args["cache_dir"] if args["cache_dir"] else None,
+            )
             model.layers[-1].activation = tf.keras.activations.softmax
 
-        train_batch_size = args['per_device_train_batch_size'] * args['n_device']
-        train_dataset, num_train_examples = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train")
+        train_batch_size = args["per_device_train_batch_size"] * args["n_device"]
+        train_dataset, num_train_examples = load_and_cache_examples(
+            args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train"
+        )
         train_dataset = strategy.experimental_distribute_dataset(train_dataset)
-        train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id)
+        train(
+            args,
+            strategy,
+            train_dataset,
+            tokenizer,
+            model,
+            num_train_examples,
+            labels,
+            train_batch_size,
+            pad_token_label_id,
+        )
 
-        if not os.path.exists(args['output_dir']):
-            os.makedirs(args['output_dir'])
+        if not os.path.exists(args["output_dir"]):
+            os.makedirs(args["output_dir"])
 
-        logging.info("Saving model to %s", args['output_dir'])
+        logging.info("Saving model to %s", args["output_dir"])
 
-        model.save_pretrained(args['output_dir'])
-        tokenizer.save_pretrained(args['output_dir'])
+        model.save_pretrained(args["output_dir"])
+        tokenizer.save_pretrained(args["output_dir"])
 
     # Evaluation
-    if args['do_eval']:
-        tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
+    if args["do_eval"]:
+        tokenizer = tokenizer_class.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"])
         checkpoints = []
         results = []
 
-        if args['eval_all_checkpoints']:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + "/**/" + TF2_WEIGHTS_NAME, recursive=True), key=lambda f: int(''.join(filter(str.isdigit, f)) or -1)))
-        
+        if args["eval_all_checkpoints"]:
+            checkpoints = list(
+                os.path.dirname(c)
+                for c in sorted(
+                    glob.glob(args["output_dir"] + "/**/" + TF2_WEIGHTS_NAME, recursive=True),
+                    key=lambda f: int("".join(filter(str.isdigit, f)) or -1),
+                )
+            )
+
         logging.info("Evaluate the following checkpoints: %s", checkpoints)
 
         if len(checkpoints) == 0:
-            checkpoints.append(args['output_dir'])
-        
+            checkpoints.append(args["output_dir"])
+
         for checkpoint in checkpoints:
             global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final"
 
             with strategy.scope():
                 model = model_class.from_pretrained(checkpoint)
 
-            y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
+            y_true, y_pred, eval_loss = evaluate(
+                args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
+            )
             report = metrics.classification_report(y_true, y_pred, digits=4)
 
             if global_step:
                 results.append({global_step + "_report": report, global_step + "_loss": eval_loss})
 
-        output_eval_file = os.path.join(args['output_dir'], "eval_results.txt")
-        
+        output_eval_file = os.path.join(args["output_dir"], "eval_results.txt")
+
         with tf.io.gfile.GFile(output_eval_file, "w") as writer:
             for res in results:
                 for key, val in res.items():
@@ -572,26 +596,28 @@ def main(_):
                         writer.write(report)
                         writer.write("\n")
 
-    if args['do_predict']:
-        tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
-        model = model_class.from_pretrained(args['output_dir'])
-        eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
-        predict_dataset, _ = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test")
+    if args["do_predict"]:
+        tokenizer = tokenizer_class.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"])
+        model = model_class.from_pretrained(args["output_dir"])
+        eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"]
+        predict_dataset, _ = load_and_cache_examples(
+            args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test"
+        )
         y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test")
-        output_test_results_file = os.path.join(args['output_dir'], "test_results.txt")
-        output_test_predictions_file = os.path.join(args['output_dir'], "test_predictions.txt")
+        output_test_results_file = os.path.join(args["output_dir"], "test_results.txt")
+        output_test_predictions_file = os.path.join(args["output_dir"], "test_predictions.txt")
         report = metrics.classification_report(y_true, y_pred, digits=4)
 
         with tf.io.gfile.GFile(output_test_results_file, "w") as writer:
             report = metrics.classification_report(y_true, y_pred, digits=4)
-            
+
             logging.info("\n" + report)
-            
+
             writer.write(report)
             writer.write("\n\nloss = " + str(pred_loss))
 
         with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer:
-            with tf.io.gfile.GFile(os.path.join(args['data_dir'], "test.txt"), "r") as f:
+            with tf.io.gfile.GFile(os.path.join(args["data_dir"], "test.txt"), "r") as f:
                 example_id = 0
 
                 for line in f:
diff --git a/examples/run_xnli.py b/examples/run_xnli.py
index 74bf295b69..9faba294dd 100644
--- a/examples/run_xnli.py
+++ b/examples/run_xnli.py
@@ -26,8 +26,7 @@ import random
 
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 
 try:
@@ -37,10 +36,18 @@ except:
 
 from tqdm import tqdm, trange
 
-from transformers import (WEIGHTS_NAME, 
-                          BertConfig, BertForSequenceClassification, BertTokenizer,
-                          XLMConfig, XLMForSequenceClassification, XLMTokenizer,
-                          DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertForSequenceClassification,
+    BertTokenizer,
+    XLMConfig,
+    XLMForSequenceClassification,
+    XLMTokenizer,
+    DistilBertConfig,
+    DistilBertForSequenceClassification,
+    DistilBertTokenizer,
+)
 
 from transformers import AdamW, get_linear_schedule_with_warmup
 
@@ -52,12 +59,14 @@ from transformers import glue_convert_examples_to_features as convert_examples_t
 
 logger = logging.getLogger(__name__)
 
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), ())
+ALL_MODELS = sum(
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), ()
+)
 
 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
-    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
+    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
+    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
 }
 
 
@@ -85,19 +94,26 @@ def train(args, train_dataset, model, tokenizer):
         t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
     # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
 
     # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
+    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+        os.path.join(args.model_name_or_path, "scheduler.pt")
+    ):
         # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
 
     if args.fp16:
         try:
@@ -112,17 +128,21 @@ def train(args, train_dataset, model, tokenizer):
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
 
     # Train!
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
     logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
     logger.info("  Total optimization steps = %d", t_total)
 
@@ -132,7 +152,7 @@ def train(args, train_dataset, model, tokenizer):
     # Check if continuing training from a checkpoint
     if os.path.exists(args.model_name_or_path):
         # set global_step to gobal_step of last saved checkpoint from model path
-        global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
+        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
         epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
         steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
 
@@ -143,7 +163,9 @@ def train(args, train_dataset, model, tokenizer):
 
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
-    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+    )
     set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
@@ -155,16 +177,16 @@ def train(args, train_dataset, model, tokenizer):
 
             model.train()
             batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      'labels':         batch[3]}
-            if args.model_type != 'distilbert':
-                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None  # XLM and DistilBERT don't use segment_ids
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = (
+                    batch[2] if args.model_type in ["bert"] else None
+                )  # XLM and DistilBERT don't use segment_ids
             outputs = model(**inputs)
             loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
 
             if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
             if args.gradient_accumulation_steps > 1:
                 loss = loss / args.gradient_accumulation_steps
 
@@ -188,28 +210,32 @@ def train(args, train_dataset, model, tokenizer):
 
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                         results = evaluate(args, model, tokenizer)
                         for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                     logging_loss = tr_loss
 
                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                     # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                     model_to_save.save_pretrained(output_dir)
                     tokenizer.save_pretrained(output_dir)
 
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to %s", output_dir)
 
-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                     logger.info("Saving optimizer and scheduler states to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
@@ -258,11 +284,11 @@ def evaluate(args, model, tokenizer, prefix=""):
             batch = tuple(t.to(args.device) for t in batch)
 
             with torch.no_grad():
-                inputs = {'input_ids':      batch[0],
-                          'attention_mask': batch[1],
-                          'labels':         batch[3]}
-                if args.model_type != 'distilbert':
-                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None  # XLM and DistilBERT don't use segment_ids
+                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+                if args.model_type != "distilbert":
+                    inputs["token_type_ids"] = (
+                        batch[2] if args.model_type in ["bert"] else None
+                    )  # XLM and DistilBERT don't use segment_ids
                 outputs = model(**inputs)
                 tmp_eval_loss, logits = outputs[:2]
 
@@ -270,16 +296,16 @@ def evaluate(args, model, tokenizer, prefix=""):
             nb_eval_steps += 1
             if preds is None:
                 preds = logits.detach().cpu().numpy()
-                out_label_ids = inputs['labels'].detach().cpu().numpy()
+                out_label_ids = inputs["labels"].detach().cpu().numpy()
             else:
                 preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
 
         eval_loss = eval_loss / nb_eval_steps
         if args.output_mode == "classification":
             preds = np.argmax(preds, axis=1)
         else:
-            raise ValueError('No other `output_mode` for XNLI.')
+            raise ValueError("No other `output_mode` for XNLI.")
         result = compute_metrics(eval_task, preds, out_label_ids)
         results.update(result)
 
@@ -300,27 +326,34 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
     processor = processors[task](language=args.language, train_language=args.train_language)
     output_mode = output_modes[task]
     # Load data features from cache or dataset file
-    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}_{}'.format(
-        'test' if evaluate else 'train',
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length),
-        str(task),
-        str(args.train_language if (not evaluate and args.train_language is not None) else args.language)))
+    cached_features_file = os.path.join(
+        args.data_dir,
+        "cached_{}_{}_{}_{}_{}".format(
+            "test" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+            str(task),
+            str(args.train_language if (not evaluate and args.train_language is not None) else args.language),
+        ),
+    )
     if os.path.exists(cached_features_file) and not args.overwrite_cache:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
         logger.info("Creating features from dataset file at %s", args.data_dir)
         label_list = processor.get_labels()
-        examples = processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-        features = convert_examples_to_features(examples,
-                                                tokenizer,
-                                                label_list=label_list,
-                                                max_length=args.max_seq_length,
-                                                output_mode=output_mode,
-                                                pad_on_left=False,
-                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-                                                pad_token_segment_id=0,
+        examples = (
+            processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+        )
+        features = convert_examples_to_features(
+            examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
+            pad_on_left=False,
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=0,
         )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
@@ -336,7 +369,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
     if output_mode == "classification":
         all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
     else:
-        raise ValueError('No other `output_mode` for XNLI.')
+        raise ValueError("No other `output_mode` for XNLI.")
 
     dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
     return dataset
@@ -346,92 +379,152 @@ def main():
     parser = argparse.ArgumentParser()
 
     ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--language", default=None, type=str, required=True,
-                        help="Evaluation language. Also train language if `train_language` is set to None.")
-    parser.add_argument("--train_language", default=None, type=str,
-                        help="Train language if is different of the evaluation language.")
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--language",
+        default=None,
+        type=str,
+        required=True,
+        help="Evaluation language. Also train language if `train_language` is set to None.",
+    )
+    parser.add_argument(
+        "--train_language", default=None, type=str, help="Train language if is different of the evaluation language."
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
 
     ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences longer "
-                             "than this will be truncated, sequences shorter will be padded.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the test set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the test set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
 
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
 
-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Avoid using CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
 
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
     args = parser.parse_args()
 
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
 
     # Setup distant debugging if needed
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
+
         print("Waiting for debugger attach")
         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()
@@ -443,22 +536,30 @@ def main():
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
         args.n_gpu = 1
     args.device = device
 
     # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
 
     # Set seed
     set_seed(args)
 
     # Prepare XNLI task
-    args.task_name = 'xnli'
+    args.task_name = "xnli"
     if args.task_name not in processors:
         raise ValueError("Task not found: %s" % (args.task_name))
     processor = processors[args.task_name](language=args.language, train_language=args.train_language)
@@ -472,17 +573,23 @@ def main():
 
     args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          num_labels=num_labels,
-                                          finetuning_task=args.task_name,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=args.task_name,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
 
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -491,14 +598,12 @@ def main():
 
     logger.info("Training/evaluation parameters %s", args)
 
-
     # Training
     if args.do_train:
         train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
         global_step, tr_loss = train(args, train_dataset, model, tokenizer)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
-
     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Create output directory if needed
@@ -508,36 +613,39 @@ def main():
         logger.info("Saving model checkpoint to %s", args.output_dir)
         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
         model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = model_class.from_pretrained(args.output_dir)
         tokenizer = tokenizer_class.from_pretrained(args.output_dir)
         model.to(args.device)
 
-
     # Evaluation
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
         tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
             logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-            
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
             result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
             results.update(result)
 
     return results
diff --git a/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py b/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
index 33b17bfb6f..d32e6fc06c 100644
--- a/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
+++ b/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
@@ -34,12 +34,30 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
-SAMPLE_TEXT = 'Hello world! cécé herlolip'
+SAMPLE_TEXT = "Hello world! cécé herlolip"
 
 
 BertAbsConfig = namedtuple(
     "BertAbsConfig",
-    ["temp_dir", "large", "use_bert_emb", "finetune_bert", "encoder", "share_emb", "max_pos", "enc_layers", "enc_hidden_size", "enc_heads", "enc_ff_size", "enc_dropout", "dec_layers", "dec_hidden_size", "dec_heads", "dec_ff_size", "dec_dropout"],
+    [
+        "temp_dir",
+        "large",
+        "use_bert_emb",
+        "finetune_bert",
+        "encoder",
+        "share_emb",
+        "max_pos",
+        "enc_layers",
+        "enc_hidden_size",
+        "enc_heads",
+        "enc_ff_size",
+        "enc_dropout",
+        "dec_layers",
+        "dec_hidden_size",
+        "dec_heads",
+        "dec_ff_size",
+        "dec_dropout",
+    ],
 )
 
 
@@ -119,7 +137,9 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
     output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0]
     output_original_generator = original.generator(output_original_model)
 
-    output_converted_model = new_model(encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask)[0]
+    output_converted_model = new_model(
+        encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask
+    )[0]
     output_converted_generator = new_model.generator(output_converted_model)
 
     maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item()
@@ -136,28 +156,21 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
     # The model has been saved with torch.save(model) and this is bound to the exact
     # directory structure. We save the state_dict instead.
     logging.info("saving the model's state dictionary")
-    torch.save(new_model.state_dict(), "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin")
+    torch.save(
+        new_model.state_dict(), "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin"
+    )
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--bertabs_checkpoint_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path the official PyTorch dump.",
+        "--bertabs_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump.",
     )
     parser.add_argument(
-        "--pytorch_dump_folder_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to the output PyTorch model.",
+        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model.",
     )
     args = parser.parse_args()
 
     convert_bertabs_checkpoints(
-        args.bertabs_checkpoint_path,
-        args.pytorch_dump_folder_path,
+        args.bertabs_checkpoint_path, args.pytorch_dump_folder_path,
     )
diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py
index 5bf1599ad2..d4d8c6648d 100644
--- a/examples/summarization/modeling_bertabs.py
+++ b/examples/summarization/modeling_bertabs.py
@@ -56,40 +56,22 @@ class BertAbs(BertAbsPreTrainedModel):
         load_bert_pretrained_extractive = True if bert_extractive_checkpoint else False
         if load_bert_pretrained_extractive:
             self.bert.model.load_state_dict(
-                dict(
-                    [
-                        (n[11:], p)
-                        for n, p in bert_extractive_checkpoint.items()
-                        if n.startswith("bert.model")
-                    ]
-                ),
+                dict([(n[11:], p) for n, p in bert_extractive_checkpoint.items() if n.startswith("bert.model")]),
                 strict=True,
             )
 
         self.vocab_size = self.bert.model.config.vocab_size
 
         if args.max_pos > 512:
-            my_pos_embeddings = nn.Embedding(
-                args.max_pos, self.bert.model.config.hidden_size
-            )
-            my_pos_embeddings.weight.data[
-                :512
-            ] = self.bert.model.embeddings.position_embeddings.weight.data
-            my_pos_embeddings.weight.data[
-                512:
-            ] = self.bert.model.embeddings.position_embeddings.weight.data[-1][
+            my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size)
+            my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
+            my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][
                 None, :
-            ].repeat(
-                args.max_pos - 512, 1
-            )
+            ].repeat(args.max_pos - 512, 1)
             self.bert.model.embeddings.position_embeddings = my_pos_embeddings
-        tgt_embeddings = nn.Embedding(
-            self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0
-        )
+        tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0)
 
-        tgt_embeddings.weight = copy.deepcopy(
-            self.bert.model.embeddings.word_embeddings.weight
-        )
+        tgt_embeddings.weight = copy.deepcopy(self.bert.model.embeddings.word_embeddings.weight)
 
         self.decoder = TransformerDecoder(
             self.args.dec_layers,
@@ -102,9 +84,7 @@ class BertAbs(BertAbsPreTrainedModel):
         )
 
         gen_func = nn.LogSoftmax(dim=-1)
-        self.generator = nn.Sequential(
-            nn.Linear(args.dec_hidden_size, args.vocab_size), gen_func
-        )
+        self.generator = nn.Sequential(nn.Linear(args.dec_hidden_size, args.vocab_size), gen_func)
         self.generator[0].weight = self.decoder.embeddings.weight
 
         load_from_checkpoints = False if checkpoint is None else True
@@ -127,25 +107,14 @@ class BertAbs(BertAbsPreTrainedModel):
                 p.data.zero_()
 
     def forward(
-        self,
-        encoder_input_ids,
-        decoder_input_ids,
-        token_type_ids,
-        encoder_attention_mask,
-        decoder_attention_mask,
+        self, encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask,
     ):
         encoder_output = self.bert(
-            input_ids=encoder_input_ids,
-            token_type_ids=token_type_ids,
-            attention_mask=encoder_attention_mask,
+            input_ids=encoder_input_ids, token_type_ids=token_type_ids, attention_mask=encoder_attention_mask,
         )
         encoder_hidden_states = encoder_output[0]
-        dec_state = self.decoder.init_decoder_state(
-            encoder_input_ids, encoder_hidden_states
-        )
-        decoder_outputs, _ = self.decoder(
-            decoder_input_ids[:, :-1], encoder_hidden_states, dec_state
-        )
+        dec_state = self.decoder.init_decoder_state(encoder_input_ids, encoder_hidden_states)
+        decoder_outputs, _ = self.decoder(decoder_input_ids[:, :-1], encoder_hidden_states, dec_state)
         return decoder_outputs
 
 
@@ -162,10 +131,7 @@ class Bert(nn.Module):
         self.eval()
         with torch.no_grad():
             encoder_outputs, _ = self.model(
-                input_ids,
-                token_type_ids=token_type_ids,
-                attention_mask=attention_mask,
-                **kwargs
+                input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, **kwargs
             )
         return encoder_outputs
 
@@ -196,10 +162,7 @@ class TransformerDecoder(nn.Module):
 
         # Build TransformerDecoder.
         self.transformer_layers = nn.ModuleList(
-            [
-                TransformerDecoderLayer(d_model, heads, d_ff, dropout)
-                for _ in range(num_layers)
-            ]
+            [TransformerDecoderLayer(d_model, heads, d_ff, dropout) for _ in range(num_layers)]
         )
 
         self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
@@ -236,20 +199,14 @@ class TransformerDecoder(nn.Module):
         # Decoder padding mask
         tgt_words = tgt
         tgt_batch, tgt_len = tgt_words.size()
-        tgt_pad_mask = (
-            tgt_words.data.eq(padding_idx).unsqueeze(1).expand(tgt_batch, tgt_len, tgt_len)
-        )
+        tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1).expand(tgt_batch, tgt_len, tgt_len)
 
         # Encoder padding mask
         if memory_mask is not None:
             src_len = memory_mask.size(-1)
             src_pad_mask = memory_mask.expand(src_batch, tgt_len, src_len)
         else:
-            src_pad_mask = (
-                src_words.data.eq(padding_idx)
-                .unsqueeze(1)
-                .expand(src_batch, tgt_len, src_len)
-            )
+            src_pad_mask = src_words.data.eq(padding_idx).unsqueeze(1).expand(src_batch, tgt_len, src_len)
 
         # Pass through the embeddings
         emb = self.embeddings(input_ids)
@@ -271,9 +228,7 @@ class TransformerDecoder(nn.Module):
                 src_pad_mask,
                 tgt_pad_mask,
                 previous_input=prev_layer_input,
-                layer_cache=state.cache["layer_{}".format(i)]
-                if state.cache is not None
-                else None,
+                layer_cache=state.cache["layer_{}".format(i)] if state.cache is not None else None,
                 step=step,
             )
             if state.cache is None:
@@ -303,9 +258,7 @@ class PositionalEncoding(nn.Module):
     def __init__(self, dropout, dim, max_len=5000):
         pe = torch.zeros(max_len, dim)
         position = torch.arange(0, max_len).unsqueeze(1)
-        div_term = torch.exp(
-            (torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim))
-        )
+        div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim)))
         pe[:, 0::2] = torch.sin(position.float() * div_term)
         pe[:, 1::2] = torch.cos(position.float() * div_term)
         pe = pe.unsqueeze(0)
@@ -356,14 +309,7 @@ class TransformerDecoderLayer(nn.Module):
         self.register_buffer("mask", mask)
 
     def forward(
-        self,
-        inputs,
-        memory_bank,
-        src_pad_mask,
-        tgt_pad_mask,
-        previous_input=None,
-        layer_cache=None,
-        step=None,
+        self, inputs, memory_bank, src_pad_mask, tgt_pad_mask, previous_input=None, layer_cache=None, step=None,
     ):
         """
         Args:
@@ -380,34 +326,20 @@ class TransformerDecoderLayer(nn.Module):
             * all_input `[batch_size x current_step x model_dim]`
 
         """
-        dec_mask = torch.gt(
-            tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)], 0
-        )
+        dec_mask = torch.gt(tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)], 0)
         input_norm = self.layer_norm_1(inputs)
         all_input = input_norm
         if previous_input is not None:
             all_input = torch.cat((previous_input, input_norm), dim=1)
             dec_mask = None
 
-        query = self.self_attn(
-            all_input,
-            all_input,
-            input_norm,
-            mask=dec_mask,
-            layer_cache=layer_cache,
-            type="self",
-        )
+        query = self.self_attn(all_input, all_input, input_norm, mask=dec_mask, layer_cache=layer_cache, type="self",)
 
         query = self.drop(query) + inputs
 
         query_norm = self.layer_norm_2(query)
         mid = self.context_attn(
-            memory_bank,
-            memory_bank,
-            query_norm,
-            mask=src_pad_mask,
-            layer_cache=layer_cache,
-            type="context",
+            memory_bank, memory_bank, query_norm, mask=src_pad_mask, layer_cache=layer_cache, type="context",
         )
         output = self.feed_forward(self.drop(mid) + query)
 
@@ -492,14 +424,7 @@ class MultiHeadedAttention(nn.Module):
             self.final_linear = nn.Linear(model_dim, model_dim)
 
     def forward(
-        self,
-        key,
-        value,
-        query,
-        mask=None,
-        layer_cache=None,
-        type=None,
-        predefined_graph_1=None,
+        self, key, value, query, mask=None, layer_cache=None, type=None, predefined_graph_1=None,
     ):
         """
         Compute the context vector and the attention vectors.
@@ -531,11 +456,7 @@ class MultiHeadedAttention(nn.Module):
 
         def unshape(x):
             """  compute context """
-            return (
-                x.transpose(1, 2)
-                .contiguous()
-                .view(batch_size, -1, head_count * dim_per_head)
-            )
+            return x.transpose(1, 2).contiguous().view(batch_size, -1, head_count * dim_per_head)
 
         # 1) Project key, value, and query.
         if layer_cache is not None:
@@ -554,9 +475,7 @@ class MultiHeadedAttention(nn.Module):
                     if layer_cache["self_keys"] is not None:
                         key = torch.cat((layer_cache["self_keys"].to(device), key), dim=2)
                     if layer_cache["self_values"] is not None:
-                        value = torch.cat(
-                            (layer_cache["self_values"].to(device), value), dim=2
-                        )
+                        value = torch.cat((layer_cache["self_values"].to(device), value), dim=2)
                     layer_cache["self_keys"] = key
                     layer_cache["self_values"] = value
             elif type == "context":
@@ -637,13 +556,9 @@ class DecoderState(object):
             sizes = e.size()
             br = sizes[1]
             if len(sizes) == 3:
-                sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2])[
-                    :, :, idx
-                ]
+                sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2])[:, :, idx]
             else:
-                sent_states = e.view(
-                    sizes[0], beam_size, br // beam_size, sizes[2], sizes[3]
-                )[:, :, idx]
+                sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2], sizes[3])[:, :, idx]
 
             sent_states.data.copy_(sent_states.data.index_select(1, positions))
 
@@ -716,11 +631,7 @@ class TransformerDecoderState(DecoderState):
 
 
 def gelu(x):
-    return (
-        0.5
-        * x
-        * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-    )
+    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
 
 
 class PositionwiseFeedForward(nn.Module):
@@ -758,9 +669,7 @@ class PositionwiseFeedForward(nn.Module):
 def build_predictor(args, tokenizer, symbols, model, logger=None):
     # we should be able to refactor the global scorer a lot
     scorer = GNMTGlobalScorer(args.alpha, length_penalty="wu")
-    translator = Translator(
-        args, model, tokenizer, symbols, global_scorer=scorer, logger=logger
-    )
+    translator = Translator(args, model, tokenizer, symbols, global_scorer=scorer, logger=logger)
     return translator
 
 
@@ -891,9 +800,7 @@ class Translator(object):
            Shouldn't need the original dataset.
         """
         with torch.no_grad():
-            return self._fast_translate_batch(
-                batch, self.max_length, min_length=self.min_length
-            )
+            return self._fast_translate_batch(batch, self.max_length, min_length=self.min_length)
 
     # Where the beam search lives
     # I have no idea why it is being called from the method above
@@ -912,26 +819,18 @@ class Translator(object):
         mask_src = batch.mask_src
 
         src_features = self.model.bert(src, segs, mask_src)
-        dec_states = self.model.decoder.init_decoder_state(
-            src, src_features, with_cache=True
-        )
+        dec_states = self.model.decoder.init_decoder_state(src, src_features, with_cache=True)
         device = src_features.device
 
         # Tile states and memory beam_size times.
         dec_states.map_batch_fn(lambda state, dim: tile(state, beam_size, dim=dim))
         src_features = tile(src_features, beam_size, dim=0)
         batch_offset = torch.arange(batch_size, dtype=torch.long, device=device)
-        beam_offset = torch.arange(
-            0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device
-        )
-        alive_seq = torch.full(
-            [batch_size * beam_size, 1], self.start_token, dtype=torch.long, device=device
-        )
+        beam_offset = torch.arange(0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device)
+        alive_seq = torch.full([batch_size * beam_size, 1], self.start_token, dtype=torch.long, device=device)
 
         # Give full probability to the first beam on the first step.
-        topk_log_probs = torch.tensor(
-            [0.0] + [float("-inf")] * (beam_size - 1), device=device
-        ).repeat(batch_size)
+        topk_log_probs = torch.tensor([0.0] + [float("-inf")] * (beam_size - 1), device=device).repeat(batch_size)
 
         # Structure that holds finished hypotheses.
         hypotheses = [[] for _ in range(batch_size)]  # noqa: F812
@@ -948,9 +847,7 @@ class Translator(object):
             # Decoder forward.
             decoder_input = decoder_input.transpose(0, 1)
 
-            dec_out, dec_states = self.model.decoder(
-                decoder_input, src_features, dec_states, step=step
-            )
+            dec_out, dec_states = self.model.decoder(decoder_input, src_features, dec_states, step=step)
 
             # Generator forward.
             log_probs = self.generator.forward(dec_out.transpose(0, 1).squeeze(0))
@@ -978,10 +875,7 @@ class Translator(object):
                         words = " ".join(words).replace(" ##", "").split()
                         if len(words) <= 3:
                             continue
-                        trigrams = [
-                            (words[i - 1], words[i], words[i + 1])
-                            for i in range(1, len(words) - 1)
-                        ]
+                        trigrams = [(words[i - 1], words[i], words[i + 1]) for i in range(1, len(words) - 1)]
                         trigram = tuple(trigrams[-1])
                         if trigram in trigrams[:-1]:
                             fail = True
@@ -999,15 +893,11 @@ class Translator(object):
             topk_ids = topk_ids.fmod(vocab_size)
 
             # Map beam_index to batch_index in the flat representation.
-            batch_index = topk_beam_index + beam_offset[
-                : topk_beam_index.size(0)
-            ].unsqueeze(1)
+            batch_index = topk_beam_index + beam_offset[: topk_beam_index.size(0)].unsqueeze(1)
             select_indices = batch_index.view(-1)
 
             # Append last prediction.
-            alive_seq = torch.cat(
-                [alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1
-            )
+            alive_seq = torch.cat([alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1)
 
             is_finished = topk_ids.eq(self.end_token)
             if step + 1 == max_length:
@@ -1040,15 +930,11 @@ class Translator(object):
                 topk_log_probs = topk_log_probs.index_select(0, non_finished)
                 batch_index = batch_index.index_select(0, non_finished)
                 batch_offset = batch_offset.index_select(0, non_finished)
-                alive_seq = predictions.index_select(0, non_finished).view(
-                    -1, alive_seq.size(-1)
-                )
+                alive_seq = predictions.index_select(0, non_finished).view(-1, alive_seq.size(-1))
             # Reorder states.
             select_indices = batch_index.view(-1)
             src_features = src_features.index_select(0, select_indices)
-            dec_states.map_batch_fn(
-                lambda state, dim: state.index_select(dim, select_indices)
-            )
+            dec_states.map_batch_fn(lambda state, dim: state.index_select(dim, select_indices))
 
         return results
 
@@ -1089,14 +975,7 @@ def tile(x, count, dim=0):
     out_size = list(x.size())
     out_size[0] *= count
     batch = x.size(0)
-    x = (
-        x.view(batch, -1)
-        .transpose(0, 1)
-        .repeat(count, 1)
-        .transpose(0, 1)
-        .contiguous()
-        .view(*out_size)
-    )
+    x = x.view(batch, -1).transpose(0, 1).repeat(count, 1).transpose(0, 1).contiguous().view(*out_size)
     if dim != 0:
         x = x.permute(perm).contiguous()
     return x
@@ -1107,6 +986,7 @@ def tile(x, count, dim=0):
 # a finetuning script.
 #
 
+
 class BertSumOptimizer(object):
     """ Specific optimizer for BertSum.
 
@@ -1126,16 +1006,10 @@ class BertSumOptimizer(object):
 
         self.optimizers = {
             "encoder": torch.optim.Adam(
-                model.encoder.parameters(),
-                lr=lr["encoder"],
-                betas=(beta_1, beta_2),
-                eps=eps,
+                model.encoder.parameters(), lr=lr["encoder"], betas=(beta_1, beta_2), eps=eps,
             ),
             "decoder": torch.optim.Adam(
-                model.decoder.parameters(),
-                lr=lr["decoder"],
-                betas=(beta_1, beta_2),
-                eps=eps,
+                model.decoder.parameters(), lr=lr["decoder"], betas=(beta_1, beta_2), eps=eps,
             ),
         }
 
@@ -1143,9 +1017,7 @@ class BertSumOptimizer(object):
         self.current_learning_rates = {}
 
     def _update_rate(self, stack):
-        return self.lr[stack] * min(
-            self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-1.5)
-        )
+        return self.lr[stack] * min(self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-1.5))
 
     def zero_grad(self):
         self.optimizer_decoder.zero_grad()
diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py
index 3c339d0c30..36210d999d 100644
--- a/examples/summarization/run_summarization.py
+++ b/examples/summarization/run_summarization.py
@@ -25,9 +25,7 @@ logger = logging.getLogger(__name__)
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 
 
-Batch = namedtuple(
-    "Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"]
-)
+Batch = namedtuple("Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"])
 
 
 def evaluate(args):
@@ -48,13 +46,14 @@ def evaluate(args):
 
         import rouge
         import nltk
-        nltk.download('punkt')
+
+        nltk.download("punkt")
         rouge_evaluator = rouge.Rouge(
-            metrics=['rouge-n', 'rouge-l'],
+            metrics=["rouge-n", "rouge-l"],
             max_n=2,
             limit_length=True,
             length_limit=args.beam_size,
-            length_limit_type='words',
+            length_limit_type="words",
             apply_avg=True,
             apply_best=False,
             alpha=0.5,  # Default F1_score
@@ -161,15 +160,15 @@ Recall    >> {:.3f}
 F1        >> {:.3f}
 Precision >> {:.3f}
 Recall    >> {:.3f}""".format(
-        scores['rouge-1']['f'],
-        scores['rouge-1']['p'],
-        scores['rouge-1']['r'],
-        scores['rouge-2']['f'],
-        scores['rouge-2']['p'],
-        scores['rouge-2']['r'],
-        scores['rouge-l']['f'],
-        scores['rouge-l']['p'],
-        scores['rouge-l']['r'],
+        scores["rouge-1"]["f"],
+        scores["rouge-1"]["p"],
+        scores["rouge-1"]["r"],
+        scores["rouge-2"]["f"],
+        scores["rouge-2"]["p"],
+        scores["rouge-2"]["r"],
+        scores["rouge-l"]["f"],
+        scores["rouge-l"]["p"],
+        scores["rouge-l"]["r"],
     )
 
 
@@ -187,9 +186,7 @@ def build_data_iterator(args, tokenizer):
     dataset = load_and_cache_examples(args, tokenizer)
     sampler = SequentialSampler(dataset)
     collate_fn = lambda data: collate(data, tokenizer, block_size=512, device=args.device)
-    iterator = DataLoader(
-        dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,
-    )
+    iterator = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,)
 
     return iterator
 
@@ -210,14 +207,9 @@ def collate(data, tokenizer, block_size, device):
     names = [name for name, _, _ in data]
     summaries = [" ".join(summary_list) for _, _, summary_list in data]
 
-    encoded_text = [
-        encode_for_summarization(story, summary, tokenizer) for _, story, summary in data
-    ]
+    encoded_text = [encode_for_summarization(story, summary, tokenizer) for _, story, summary in data]
     encoded_stories = torch.tensor(
-        [
-            fit_to_block_size(story, block_size, tokenizer.pad_token_id)
-            for story, _ in encoded_text
-        ]
+        [fit_to_block_size(story, block_size, tokenizer.pad_token_id) for story, _ in encoded_text]
     )
     encoder_token_type_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id)
     encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id)
@@ -272,38 +264,23 @@ def main():
     )
     # EVALUATION options
     parser.add_argument(
-        "--no_cuda",
-        default=False,
-        type=bool,
-        help="Whether to force the execution on CPU.",
+        "--no_cuda", default=False, type=bool, help="Whether to force the execution on CPU.",
     )
     parser.add_argument(
         "--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
     )
     # BEAM SEARCH arguments
     parser.add_argument(
-        "--min_length",
-        default=50,
-        type=int,
-        help="Minimum number of tokens for the summaries.",
+        "--min_length", default=50, type=int, help="Minimum number of tokens for the summaries.",
     )
     parser.add_argument(
-        "--max_length",
-        default=200,
-        type=int,
-        help="Maixmum number of tokens for the summaries.",
+        "--max_length", default=200, type=int, help="Maixmum number of tokens for the summaries.",
     )
     parser.add_argument(
-        "--beam_size",
-        default=5,
-        type=int,
-        help="The number of beams to start with for each example.",
+        "--beam_size", default=5, type=int, help="The number of beams to start with for each example.",
     )
     parser.add_argument(
-        "--alpha",
-        default=0.95,
-        type=float,
-        help="The value of alpha for the length penalty in the beam search.",
+        "--alpha", default=0.95, type=float, help="The value of alpha for the length penalty in the beam search.",
     )
     parser.add_argument(
         "--block_trigram",
diff --git a/examples/summarization/utils_summarization.py b/examples/summarization/utils_summarization.py
index 1d8c436ac9..96470f47a2 100644
--- a/examples/summarization/utils_summarization.py
+++ b/examples/summarization/utils_summarization.py
@@ -68,9 +68,7 @@ def process_story(raw_story):
     Raises:
         IndexError: If the stoy is empty or contains no highlights.
     """
-    nonempty_lines = list(
-        filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")])
-    )
+    nonempty_lines = list(filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")]))
 
     # for some unknown reason some lines miss a period, add it
     nonempty_lines = [_add_missing_period(line) for line in nonempty_lines]
@@ -135,13 +133,9 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer):
     sentences.
     """
     story_lines_token_ids = [tokenizer.encode(line) for line in story_lines]
-    story_token_ids = [
-        token for sentence in story_lines_token_ids for token in sentence
-    ]
+    story_token_ids = [token for sentence in story_lines_token_ids for token in sentence]
     summary_lines_token_ids = [tokenizer.encode(line) for line in summary_lines]
-    summary_token_ids = [
-        token for sentence in summary_lines_token_ids for token in sentence
-    ]
+    summary_token_ids = [token for sentence in summary_lines_token_ids for token in sentence]
 
     return story_token_ids, summary_token_ids
 
diff --git a/examples/summarization/utils_summarization_test.py b/examples/summarization/utils_summarization_test.py
index 8bfbf6ab23..253eae388d 100644
--- a/examples/summarization/utils_summarization_test.py
+++ b/examples/summarization/utils_summarization_test.py
@@ -33,25 +33,19 @@ class SummarizationDataProcessingTest(unittest.TestCase):
         """ Pad the sequence with 0 if the sequence is smaller than the block size."""
         sequence = [1, 2, 3, 4]
         expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
-        self.assertEqual(
-            fit_to_block_size(sequence, self.block_size, 0), expected_output
-        )
+        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
 
     def test_fit_to_block_sequence_fit_exactly(self):
         """ Do nothing if the sequence is the right size. """
         sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
         expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        self.assertEqual(
-            fit_to_block_size(sequence, self.block_size, 0), expected_output
-        )
+        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
 
     def test_fit_to_block_sequence_too_big(self):
         """ Truncate the sequence if it is too long. """
         sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
         expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        self.assertEqual(
-            fit_to_block_size(sequence, self.block_size, 0), expected_output
-        )
+        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
 
     def test_process_story_no_highlights(self):
         """ Processing a story with no highlights returns an empty list for the summary.
@@ -95,9 +89,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
     def test_build_mask(self):
         sequence = torch.tensor([1, 2, 3, 4, 23, 23, 23])
         expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
-        np.testing.assert_array_equal(
-            build_mask(sequence, 23).numpy(), expected.numpy()
-        )
+        np.testing.assert_array_equal(build_mask(sequence, 23).numpy(), expected.numpy())
 
     def test_build_mask_with_padding_equal_to_one(self):
         sequence = torch.tensor([8, 2, 3, 4, 1, 1, 1])
@@ -106,12 +98,8 @@ class SummarizationDataProcessingTest(unittest.TestCase):
 
     def test_compute_token_type_ids(self):
         separator = 101
-        batch = torch.tensor(
-            [[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]]
-        )
-        expected = torch.tensor(
-            [[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]]
-        )
+        batch = torch.tensor([[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]])
+        expected = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]])
 
         result = compute_token_type_ids(batch, separator)
         np.testing.assert_array_equal(result, expected)
diff --git a/examples/test_examples.py b/examples/test_examples.py
index 632d2f728e..1293559c26 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -35,34 +35,36 @@ logging.basicConfig(level=logging.DEBUG)
 
 logger = logging.getLogger()
 
+
 def get_setup_file():
     parser = argparse.ArgumentParser()
-    parser.add_argument('-f')
+    parser.add_argument("-f")
     args = parser.parse_args()
     return args.f
 
-class ExamplesTests(unittest.TestCase):
 
+class ExamplesTests(unittest.TestCase):
     def test_run_glue(self):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
 
-        testargs = ["run_glue.py",
-                    "--data_dir=./examples/tests_samples/MRPC/",
-                    "--task_name=mrpc",
-                    "--do_train",
-                    "--do_eval",
-                    "--output_dir=./examples/tests_samples/temp_dir",
-                    "--per_gpu_train_batch_size=2",
-                    "--per_gpu_eval_batch_size=1",
-                    "--learning_rate=1e-4",
-                    "--max_steps=10",
-                    "--warmup_steps=2",
-                    "--overwrite_output_dir",
-                    "--seed=42"]
-        model_type, model_name = ("--model_type=bert",
-                                  "--model_name_or_path=bert-base-uncased")
-        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+        testargs = [
+            "run_glue.py",
+            "--data_dir=./examples/tests_samples/MRPC/",
+            "--task_name=mrpc",
+            "--do_train",
+            "--do_eval",
+            "--output_dir=./examples/tests_samples/temp_dir",
+            "--per_gpu_train_batch_size=2",
+            "--per_gpu_eval_batch_size=1",
+            "--learning_rate=1e-4",
+            "--max_steps=10",
+            "--warmup_steps=2",
+            "--overwrite_output_dir",
+            "--seed=42",
+        ]
+        model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased")
+        with patch.object(sys, "argv", testargs + [model_type, model_name]):
             result = run_glue.main()
             for value in result.values():
                 self.assertGreaterEqual(value, 0.75)
@@ -71,40 +73,38 @@ class ExamplesTests(unittest.TestCase):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
 
-        testargs = ["run_squad.py",
-                    "--data_dir=./examples/tests_samples/SQUAD",
-                    "--model_name=bert-base-uncased",
-                    "--output_dir=./examples/tests_samples/temp_dir",
-                    "--max_steps=10",
-                    "--warmup_steps=2",
-                    "--do_train",
-                    "--do_eval",
-                    "--version_2_with_negative",
-                    "--learning_rate=2e-4",
-                    "--per_gpu_train_batch_size=2",
-                    "--per_gpu_eval_batch_size=1",
-                    "--overwrite_output_dir",
-                    "--seed=42"]
-        model_type, model_name = ("--model_type=bert",
-                                  "--model_name_or_path=bert-base-uncased")
-        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+        testargs = [
+            "run_squad.py",
+            "--data_dir=./examples/tests_samples/SQUAD",
+            "--model_name=bert-base-uncased",
+            "--output_dir=./examples/tests_samples/temp_dir",
+            "--max_steps=10",
+            "--warmup_steps=2",
+            "--do_train",
+            "--do_eval",
+            "--version_2_with_negative",
+            "--learning_rate=2e-4",
+            "--per_gpu_train_batch_size=2",
+            "--per_gpu_eval_batch_size=1",
+            "--overwrite_output_dir",
+            "--seed=42",
+        ]
+        model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased")
+        with patch.object(sys, "argv", testargs + [model_type, model_name]):
             result = run_squad.main()
-            self.assertGreaterEqual(result['f1'], 30)
-            self.assertGreaterEqual(result['exact'], 30)
+            self.assertGreaterEqual(result["f1"], 30)
+            self.assertGreaterEqual(result["exact"], 30)
 
     def test_generation(self):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
 
-        testargs = ["run_generation.py",
-                    "--prompt=Hello",
-                    "--length=10",
-                    "--seed=42"]
-        model_type, model_name = ("--model_type=openai-gpt",
-                                  "--model_name_or_path=openai-gpt")
-        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+        testargs = ["run_generation.py", "--prompt=Hello", "--length=10", "--seed=42"]
+        model_type, model_name = ("--model_type=openai-gpt", "--model_name_or_path=openai-gpt")
+        with patch.object(sys, "argv", testargs + [model_type, model_name]):
             result = run_generation.main()
             self.assertGreaterEqual(len(result), 10)
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py
index a131a63924..492eb23e35 100644
--- a/examples/utils_multiple_choice.py
+++ b/examples/utils_multiple_choice.py
@@ -55,19 +55,10 @@ class InputExample(object):
 
 
 class InputFeatures(object):
-    def __init__(self,
-                 example_id,
-                 choices_features,
-                 label
-
-    ):
+    def __init__(self, example_id, choices_features, label):
         self.example_id = example_id
         self.choices_features = [
-            {
-                'input_ids': input_ids,
-                'input_mask': input_mask,
-                'segment_ids': segment_ids
-            }
+            {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
             for input_ids, input_mask, segment_ids in choices_features
         ]
         self.label = label
@@ -99,29 +90,29 @@ class RaceProcessor(DataProcessor):
     def get_train_examples(self, data_dir):
         """See base class."""
         logger.info("LOOKING AT {} train".format(data_dir))
-        high = os.path.join(data_dir, 'train/high')
-        middle = os.path.join(data_dir, 'train/middle')
+        high = os.path.join(data_dir, "train/high")
+        middle = os.path.join(data_dir, "train/middle")
         high = self._read_txt(high)
         middle = self._read_txt(middle)
-        return self._create_examples(high + middle, 'train')
+        return self._create_examples(high + middle, "train")
 
     def get_dev_examples(self, data_dir):
         """See base class."""
         logger.info("LOOKING AT {} dev".format(data_dir))
-        high = os.path.join(data_dir, 'dev/high')
-        middle = os.path.join(data_dir, 'dev/middle')
+        high = os.path.join(data_dir, "dev/high")
+        middle = os.path.join(data_dir, "dev/middle")
         high = self._read_txt(high)
         middle = self._read_txt(middle)
-        return self._create_examples(high + middle, 'dev')
+        return self._create_examples(high + middle, "dev")
 
     def get_test_examples(self, data_dir):
         """See base class."""
         logger.info("LOOKING AT {} test".format(data_dir))
-        high = os.path.join(data_dir, 'test/high')
-        middle = os.path.join(data_dir, 'test/middle')
+        high = os.path.join(data_dir, "test/high")
+        middle = os.path.join(data_dir, "test/middle")
         high = self._read_txt(high)
         middle = self._read_txt(middle)
-        return self._create_examples(high + middle, 'test')
+        return self._create_examples(high + middle, "test")
 
     def get_labels(self):
         """See base class."""
@@ -131,13 +122,12 @@ class RaceProcessor(DataProcessor):
         lines = []
         files = glob.glob(input_dir + "/*txt")
         for file in tqdm.tqdm(files, desc="read files"):
-            with open(file, 'r', encoding='utf-8') as fin:
+            with open(file, "r", encoding="utf-8") as fin:
                 data_raw = json.load(fin)
                 data_raw["race_id"] = file
                 lines.append(data_raw)
         return lines
 
-
     def _create_examples(self, lines, set_type):
         """Creates examples for the training and dev sets."""
         examples = []
@@ -145,19 +135,22 @@ class RaceProcessor(DataProcessor):
             race_id = "%s-%s" % (set_type, data_raw["race_id"])
             article = data_raw["article"]
             for i in range(len(data_raw["answers"])):
-                truth = str(ord(data_raw['answers'][i]) - ord('A'))
-                question = data_raw['questions'][i]
-                options = data_raw['options'][i]
+                truth = str(ord(data_raw["answers"][i]) - ord("A"))
+                question = data_raw["questions"][i]
+                options = data_raw["options"][i]
 
                 examples.append(
                     InputExample(
                         example_id=race_id,
                         question=question,
-                        contexts=[article, article, article, article], # this is not efficient but convenient
+                        contexts=[article, article, article, article],  # this is not efficient but convenient
                         endings=[options[0], options[1], options[2], options[3]],
-                        label=truth))
+                        label=truth,
+                    )
+                )
         return examples
 
+
 class SwagProcessor(DataProcessor):
     """Processor for the SWAG data set."""
 
@@ -179,27 +172,25 @@ class SwagProcessor(DataProcessor):
             "setting!"
         )
         return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test")
+
     def get_labels(self):
         """See base class."""
         return ["0", "1", "2", "3"]
 
     def _read_csv(self, input_file):
-        with open(input_file, 'r', encoding='utf-8') as f:
+        with open(input_file, "r", encoding="utf-8") as f:
             reader = csv.reader(f)
             lines = []
             for line in reader:
                 if sys.version_info[0] == 2:
-                    line = list(unicode(cell, 'utf-8') for cell in line)
+                    line = list(unicode(cell, "utf-8") for cell in line)
                 lines.append(line)
             return lines
 
-
     def _create_examples(self, lines: List[List[str]], type: str):
         """Creates examples for the training and dev sets."""
-        if type == "train" and lines[0][-1] != 'label':
-            raise ValueError(
-                "For training, the input file must contain a label column."
-            )
+        if type == "train" and lines[0][-1] != "label":
+            raise ValueError("For training, the input file must contain a label column.")
 
         examples = [
             InputExample(
@@ -207,10 +198,11 @@ class SwagProcessor(DataProcessor):
                 question=line[5],  # in the swag dataset, the
                 # common beginning of each
                 # choice is stored in "sent2".
-                contexts = [line[4], line[4], line[4], line[4]],
-                endings = [line[7], line[8], line[9], line[10]],
-                label=line[11]
-            ) for line in lines[1:]  # we skip the line with the column names
+                contexts=[line[4], line[4], line[4], line[4]],
+                endings=[line[7], line[8], line[9], line[10]],
+                label=line[11],
+            )
+            for line in lines[1:]  # we skip the line with the column names
         ]
 
         return examples
@@ -238,15 +230,14 @@ class ArcProcessor(DataProcessor):
         return ["0", "1", "2", "3"]
 
     def _read_json(self, input_file):
-        with open(input_file, 'r', encoding='utf-8') as fin:
+        with open(input_file, "r", encoding="utf-8") as fin:
             lines = fin.readlines()
             return lines
 
-
     def _create_examples(self, lines, type):
         """Creates examples for the training and dev sets."""
 
-        #There are two types of labels. They should be normalized
+        # There are two types of labels. They should be normalized
         def normalize(truth):
             if truth in "ABCD":
                 return ord(truth) - ord("A")
@@ -283,12 +274,18 @@ class ArcProcessor(DataProcessor):
             if len(options) == 4:
                 examples.append(
                     InputExample(
-                        example_id = id,
+                        example_id=id,
                         question=question,
-                        contexts=[options[0]["para"].replace("_", ""), options[1]["para"].replace("_", ""),
-                                  options[2]["para"].replace("_", ""), options[3]["para"].replace("_", "")],
+                        contexts=[
+                            options[0]["para"].replace("_", ""),
+                            options[1]["para"].replace("_", ""),
+                            options[2]["para"].replace("_", ""),
+                            options[3]["para"].replace("_", ""),
+                        ],
                         endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
-                        label=truth))
+                        label=truth,
+                    )
+                )
 
         if type == "train":
             assert len(examples) > 1
@@ -316,7 +313,7 @@ def convert_examples_to_features(
     Loads a data file into a list of `InputFeatures`
     """
 
-    label_map = {label : i for i, label in enumerate(label_list)}
+    label_map = {label: i for i, label in enumerate(label_list)}
 
     features = []
     for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
@@ -331,16 +328,13 @@ def convert_examples_to_features(
             else:
                 text_b = example.question + " " + ending
 
-            inputs = tokenizer.encode_plus(
-                text_a,
-                text_b,
-                add_special_tokens=True,
-                max_length=max_length,
-            )
-            if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0:
-                logger.info('Attention! you are cropping tokens (swag task is ok). '
-                        'If you are training ARC and RACE and you are poping question + options,'
-                        'you need to try to use a bigger max seq length!')
+            inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length,)
+            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
+                logger.info(
+                    "Attention! you are cropping tokens (swag task is ok). "
+                    "If you are training ARC and RACE and you are poping question + options,"
+                    "you need to try to use a bigger max seq length!"
+                )
 
             input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
 
@@ -364,7 +358,6 @@ def convert_examples_to_features(
             assert len(token_type_ids) == max_length
             choices_features.append((input_ids, attention_mask, token_type_ids))
 
-
         label = label_map[example.label]
 
         if ex_index < 2:
@@ -372,33 +365,17 @@ def convert_examples_to_features(
             logger.info("race_id: {}".format(example.example_id))
             for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
                 logger.info("choice: {}".format(choice_idx))
-                logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
-                logger.info("attention_mask: {}".format(' '.join(map(str, attention_mask))))
-                logger.info("token_type_ids: {}".format(' '.join(map(str, token_type_ids))))
+                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
+                logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask))))
+                logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids))))
                 logger.info("label: {}".format(label))
 
-        features.append(
-            InputFeatures(
-                example_id=example.example_id,
-                choices_features=choices_features,
-                label=label,
-            )
-        )
+        features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,))
 
     return features
 
 
+processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor}
 
 
-processors = {
-    "race": RaceProcessor,
-    "swag": SwagProcessor,
-    "arc": ArcProcessor
-}
-
-
-MULTIPLE_CHOICE_TASKS_NUM_LABELS = {
-    "race", 4,
-    "swag", 4,
-    "arc", 4
-}
+MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4}
diff --git a/examples/utils_ner.py b/examples/utils_ner.py
index 45ddeafbd5..d37583469c 100644
--- a/examples/utils_ner.py
+++ b/examples/utils_ner.py
@@ -61,9 +61,7 @@ def read_examples_from_file(data_dir, mode):
         for line in f:
             if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                 if words:
-                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index),
-                                                 words=words,
-                                                 labels=labels))
+                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels))
                     guid_index += 1
                     words = []
                     labels = []
@@ -76,27 +74,27 @@ def read_examples_from_file(data_dir, mode):
                     # Examples could have no label for mode = "test"
                     labels.append("O")
         if words:
-            examples.append(InputExample(guid="%s-%d".format(mode, guid_index),
-                                         words=words,
-                                         labels=labels))
+            examples.append(InputExample(guid="%s-%d".format(mode, guid_index), words=words, labels=labels))
     return examples
 
 
-def convert_examples_to_features(examples,
-                                 label_list,
-                                 max_seq_length,
-                                 tokenizer,
-                                 cls_token_at_end=False,
-                                 cls_token="[CLS]",
-                                 cls_token_segment_id=1,
-                                 sep_token="[SEP]",
-                                 sep_token_extra=False,
-                                 pad_on_left=False,
-                                 pad_token=0,
-                                 pad_token_segment_id=0,
-                                 pad_token_label_id=-100,
-                                 sequence_a_segment_id=0,
-                                 mask_padding_with_zero=True):
+def convert_examples_to_features(
+    examples,
+    label_list,
+    max_seq_length,
+    tokenizer,
+    cls_token_at_end=False,
+    cls_token="[CLS]",
+    cls_token_segment_id=1,
+    sep_token="[SEP]",
+    sep_token_extra=False,
+    pad_on_left=False,
+    pad_token=0,
+    pad_token_segment_id=0,
+    pad_token_label_id=-100,
+    sequence_a_segment_id=0,
+    mask_padding_with_zero=True,
+):
     """ Loads a data file into a list of `InputBatch`s
         `cls_token_at_end` define the location of the CLS token:
             - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
@@ -122,8 +120,8 @@ def convert_examples_to_features(examples,
         # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
         special_tokens_count = 3 if sep_token_extra else 2
         if len(tokens) > max_seq_length - special_tokens_count:
-            tokens = tokens[:(max_seq_length - special_tokens_count)]
-            label_ids = label_ids[:(max_seq_length - special_tokens_count)]
+            tokens = tokens[: (max_seq_length - special_tokens_count)]
+            label_ids = label_ids[: (max_seq_length - special_tokens_count)]
 
         # The convention in BERT is:
         # (a) For sequence pairs:
@@ -174,10 +172,10 @@ def convert_examples_to_features(examples,
             segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
             label_ids = ([pad_token_label_id] * padding_length) + label_ids
         else:
-            input_ids += ([pad_token] * padding_length)
-            input_mask += ([0 if mask_padding_with_zero else 1] * padding_length)
-            segment_ids += ([pad_token_segment_id] * padding_length)
-            label_ids += ([pad_token_label_id] * padding_length)
+            input_ids += [pad_token] * padding_length
+            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
+            segment_ids += [pad_token_segment_id] * padding_length
+            label_ids += [pad_token_label_id] * padding_length
 
         assert len(input_ids) == max_seq_length
         assert len(input_mask) == max_seq_length
@@ -194,10 +192,8 @@ def convert_examples_to_features(examples,
             logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))
 
         features.append(
-                InputFeatures(input_ids=input_ids,
-                              input_mask=input_mask,
-                              segment_ids=segment_ids,
-                              label_ids=label_ids))
+            InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids)
+        )
     return features
 
 
@@ -209,4 +205,4 @@ def get_labels(path):
             labels = ["O"] + labels
         return labels
     else:
-        return ["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
+        return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
diff --git a/hubconf.py b/hubconf.py
index 3fa354ed5a..1d100271ae 100644
--- a/hubconf.py
+++ b/hubconf.py
@@ -1,9 +1,15 @@
 from transformers import (
-    AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
+    AutoTokenizer,
+    AutoConfig,
+    AutoModel,
+    AutoModelWithLMHead,
+    AutoModelForSequenceClassification,
+    AutoModelForQuestionAnswering,
 )
 from transformers.file_utils import add_start_docstrings
 
-dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
+dependencies = ["torch", "tqdm", "boto3", "requests", "regex", "sentencepiece", "sacremoses"]
+
 
 @add_start_docstrings(AutoConfig.__doc__)
 def config(*args, **kwargs):
@@ -57,6 +63,7 @@ def model(*args, **kwargs):
 
     return AutoModel.from_pretrained(*args, **kwargs)
 
+
 @add_start_docstrings(AutoModelWithLMHead.__doc__)
 def modelWithLMHead(*args, **kwargs):
     r"""
diff --git a/setup.py b/setup.py
index fe2e1526bf..59dbfef12b 100644
--- a/setup.py
+++ b/setup.py
@@ -38,11 +38,11 @@ from setuptools import find_packages, setup
 
 
 extras = {
-    'serving': ['pydantic', 'uvicorn', 'fastapi'],
-    'serving-tf': ['pydantic', 'uvicorn', 'fastapi', 'tensorflow'],
-    'serving-torch': ['pydantic', 'uvicorn', 'fastapi', 'torch']
+    "serving": ["pydantic", "uvicorn", "fastapi"],
+    "serving-tf": ["pydantic", "uvicorn", "fastapi", "tensorflow"],
+    "serving-torch": ["pydantic", "uvicorn", "fastapi", "torch"],
 }
-extras['all'] = [package for package in extras.values()]
+extras["all"] = [package for package in extras.values()]
 
 setup(
     name="transformers",
@@ -50,30 +50,29 @@ setup(
     author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
     author_email="thomas@huggingface.co",
     description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
-    long_description=open("README.md", "r", encoding='utf-8').read(),
+    long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
-    keywords='NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU',
-    license='Apache',
+    keywords="NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU",
+    license="Apache",
     url="https://github.com/huggingface/transformers",
-    packages=find_packages(exclude=["*.tests", "*.tests.*",
-                                    "tests.*", "tests"]),
-    install_requires=['numpy',
-                      'boto3',
-                      'filelock',
-                      'requests',
-                      'tqdm',
-                      'regex != 2019.12.17',
-                      'sentencepiece',
-                      'sacremoses'],
-    extras_require=extras,
-    scripts=[
-        'transformers-cli'
+    packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
+    install_requires=[
+        "numpy",
+        "boto3",
+        "filelock",
+        "requests",
+        "tqdm",
+        "regex != 2019.12.17",
+        "sentencepiece",
+        "sacremoses",
     ],
+    extras_require=extras,
+    scripts=["transformers-cli"],
     # python_requires='>=3.5.0',
     classifiers=[
-          'Intended Audience :: Science/Research',
-          'License :: OSI Approved :: Apache Software License',
-          'Programming Language :: Python :: 3',
-          'Topic :: Scientific/Engineering :: Artificial Intelligence',
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: Apache Software License",
+        "Programming Language :: Python :: 3",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
 )
diff --git a/templates/adding_a_new_example_script/run_xxx.py b/templates/adding_a_new_example_script/run_xxx.py
index 77ce587a54..64e92f2a28 100644
--- a/templates/adding_a_new_example_script/run_xxx.py
+++ b/templates/adding_a_new_example_script/run_xxx.py
@@ -24,8 +24,7 @@ import glob
 
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 
 try:
@@ -35,19 +34,32 @@ except:
 
 from tqdm import tqdm, trange
 
-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForQuestionAnswering, BertTokenizer,
-                                  XLMConfig, XLMForQuestionAnswering,
-                                  XLMTokenizer, XLNetConfig,
-                                  XLNetForQuestionAnswering,
-                                  XLNetTokenizer,
-                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertForQuestionAnswering,
+    BertTokenizer,
+    XLMConfig,
+    XLMForQuestionAnswering,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetForQuestionAnswering,
+    XLNetTokenizer,
+    DistilBertConfig,
+    DistilBertForQuestionAnswering,
+    DistilBertTokenizer,
+)
 
 from transformers import AdamW, get_linear_schedule_with_warmup
 
-from utils_squad import (read_squad_examples, convert_examples_to_features,
-                         RawResult, write_predictions,
-                         RawResultExtended, write_predictions_extended)
+from utils_squad import (
+    read_squad_examples,
+    convert_examples_to_features,
+    RawResult,
+    write_predictions,
+    RawResultExtended,
+    write_predictions_extended,
+)
 
 # The follwing import is the official SQuAD evaluation script (2.0).
 # You can remove it from the dependencies if you are using this script outside of the library
@@ -56,16 +68,18 @@ from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
 
 logger = logging.getLogger(__name__)
 
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
-                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
+ALL_MODELS = sum(
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ()
+)
 
 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
-    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
-    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+    "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer),
+    "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
+    "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
 }
 
+
 def set_seed(args):
     random.seed(args.seed)
     np.random.seed(args.seed)
@@ -73,9 +87,11 @@ def set_seed(args):
     if args.n_gpu > 0:
         torch.cuda.manual_seed_all(args.seed)
 
+
 def to_list(tensor):
     return tensor.detach().cpu().tolist()
 
+
 def train(args, train_dataset, model, tokenizer):
     """ Train the model """
     if args.local_rank in [-1, 0]:
@@ -92,13 +108,18 @@ def train(args, train_dataset, model, tokenizer):
         t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
     # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
     if args.fp16:
         try:
             from apex import amp
@@ -112,17 +133,21 @@ def train(args, train_dataset, model, tokenizer):
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
 
     # Train!
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
     logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
     logger.info("  Total optimization steps = %d", t_total)
 
@@ -136,20 +161,21 @@ def train(args, train_dataset, model, tokenizer):
         for step, batch in enumerate(epoch_iterator):
             model.train()
             batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':       batch[0],
-                      'attention_mask':  batch[1],
-                      'start_positions': batch[3],
-                      'end_positions':   batch[4]}
-            if args.model_type != 'distilbert':
-                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
-            if args.model_type in ['xlnet', 'xlm']:
-                inputs.update({'cls_index': batch[5],
-                               'p_mask':       batch[6]})
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                "start_positions": batch[3],
+                "end_positions": batch[4],
+            }
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]
+            if args.model_type in ["xlnet", "xlm"]:
+                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
             outputs = model(**inputs)
             loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
 
             if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
             if args.gradient_accumulation_steps > 1:
                 loss = loss / args.gradient_accumulation_steps
 
@@ -173,22 +199,26 @@ def train(args, train_dataset, model, tokenizer):
 
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                         results = evaluate(args, model, tokenizer)
                         for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                     logging_loss = tr_loss
 
                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                     # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                     model_to_save.save_pretrained(output_dir)
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
@@ -224,32 +254,31 @@ def evaluate(args, model, tokenizer, prefix=""):
         model.eval()
         batch = tuple(t.to(args.device) for t in batch)
         with torch.no_grad():
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1]
-                      }
-            if args.model_type != 'distilbert':
-                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]  # XLM don't use segment_ids
             example_indices = batch[3]
-            if args.model_type in ['xlnet', 'xlm']:
-                inputs.update({'cls_index': batch[4],
-                               'p_mask':    batch[5]})
+            if args.model_type in ["xlnet", "xlm"]:
+                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
             outputs = model(**inputs)
 
         for i, example_index in enumerate(example_indices):
             eval_feature = features[example_index.item()]
             unique_id = int(eval_feature.unique_id)
-            if args.model_type in ['xlnet', 'xlm']:
+            if args.model_type in ["xlnet", "xlm"]:
                 # XLNet uses a more complex post-processing procedure
-                result = RawResultExtended(unique_id            = unique_id,
-                                           start_top_log_probs  = to_list(outputs[0][i]),
-                                           start_top_index      = to_list(outputs[1][i]),
-                                           end_top_log_probs    = to_list(outputs[2][i]),
-                                           end_top_index        = to_list(outputs[3][i]),
-                                           cls_logits           = to_list(outputs[4][i]))
+                result = RawResultExtended(
+                    unique_id=unique_id,
+                    start_top_log_probs=to_list(outputs[0][i]),
+                    start_top_index=to_list(outputs[1][i]),
+                    end_top_log_probs=to_list(outputs[2][i]),
+                    end_top_index=to_list(outputs[3][i]),
+                    cls_logits=to_list(outputs[4][i]),
+                )
             else:
-                result = RawResult(unique_id    = unique_id,
-                                   start_logits = to_list(outputs[0][i]),
-                                   end_logits   = to_list(outputs[1][i]))
+                result = RawResult(
+                    unique_id=unique_id, start_logits=to_list(outputs[0][i]), end_logits=to_list(outputs[1][i])
+                )
             all_results.append(result)
 
     # Compute predictions
@@ -260,23 +289,44 @@ def evaluate(args, model, tokenizer, prefix=""):
     else:
         output_null_log_odds_file = None
 
-    if args.model_type in ['xlnet', 'xlm']:
+    if args.model_type in ["xlnet", "xlm"]:
         # XLNet uses a more complex post-processing procedure
-        write_predictions_extended(examples, features, all_results, args.n_best_size,
-                        args.max_answer_length, output_prediction_file,
-                        output_nbest_file, output_null_log_odds_file, args.predict_file,
-                        model.config.start_n_top, model.config.end_n_top,
-                        args.version_2_with_negative, tokenizer, args.verbose_logging)
+        write_predictions_extended(
+            examples,
+            features,
+            all_results,
+            args.n_best_size,
+            args.max_answer_length,
+            output_prediction_file,
+            output_nbest_file,
+            output_null_log_odds_file,
+            args.predict_file,
+            model.config.start_n_top,
+            model.config.end_n_top,
+            args.version_2_with_negative,
+            tokenizer,
+            args.verbose_logging,
+        )
     else:
-        write_predictions(examples, features, all_results, args.n_best_size,
-                        args.max_answer_length, args.do_lower_case, output_prediction_file,
-                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
-                        args.version_2_with_negative, args.null_score_diff_threshold)
+        write_predictions(
+            examples,
+            features,
+            all_results,
+            args.n_best_size,
+            args.max_answer_length,
+            args.do_lower_case,
+            output_prediction_file,
+            output_nbest_file,
+            output_null_log_odds_file,
+            args.verbose_logging,
+            args.version_2_with_negative,
+            args.null_score_diff_threshold,
+        )
 
     # Evaluate with the official SQuAD script
-    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
-                                 pred_file=output_prediction_file,
-                                 na_prob_file=output_null_log_odds_file)
+    evaluate_options = EVAL_OPTS(
+        data_file=args.predict_file, pred_file=output_prediction_file, na_prob_file=output_null_log_odds_file
+    )
     results = evaluate_on_squad(evaluate_options)
     return results
 
@@ -287,24 +337,30 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
 
     # Load data features from cache or dataset file
     input_file = args.predict_file if evaluate else args.train_file
-    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
-        'dev' if evaluate else 'train',
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length)))
+    cached_features_file = os.path.join(
+        os.path.dirname(input_file),
+        "cached_{}_{}_{}".format(
+            "dev" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+        ),
+    )
     if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
         logger.info("Creating features from dataset file at %s", input_file)
-        examples = read_squad_examples(input_file=input_file,
-                                                is_training=not evaluate,
-                                                version_2_with_negative=args.version_2_with_negative)
-        features = convert_examples_to_features(examples=examples,
-                                                tokenizer=tokenizer,
-                                                max_seq_length=args.max_seq_length,
-                                                doc_stride=args.doc_stride,
-                                                max_query_length=args.max_query_length,
-                                                is_training=not evaluate)
+        examples = read_squad_examples(
+            input_file=input_file, is_training=not evaluate, version_2_with_negative=args.version_2_with_negative
+        )
+        features = convert_examples_to_features(
+            examples=examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=not evaluate,
+        )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
@@ -320,14 +376,21 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
     all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
     if evaluate:
         all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_example_index, all_cls_index, all_p_mask)
+        dataset = TensorDataset(
+            all_input_ids, all_input_mask, all_segment_ids, all_example_index, all_cls_index, all_p_mask
+        )
     else:
         all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
         all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_start_positions, all_end_positions,
-                                all_cls_index, all_p_mask)
+        dataset = TensorDataset(
+            all_input_ids,
+            all_input_mask,
+            all_segment_ids,
+            all_start_positions,
+            all_end_positions,
+            all_cls_index,
+            all_p_mask,
+        )
 
     if output_examples:
         return dataset, examples, features
@@ -338,109 +401,190 @@ def main():
     parser = argparse.ArgumentParser()
 
     ## Required parameters
-    parser.add_argument("--train_file", default=None, type=str, required=True,
-                        help="SQuAD json for training. E.g., train-v1.1.json")
-    parser.add_argument("--predict_file", default=None, type=str, required=True,
-                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model checkpoints and predictions will be written.")
+    parser.add_argument(
+        "--train_file", default=None, type=str, required=True, help="SQuAD json for training. E.g., train-v1.1.json"
+    )
+    parser.add_argument(
+        "--predict_file",
+        default=None,
+        type=str,
+        required=True,
+        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model checkpoints and predictions will be written.",
+    )
 
     ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
 
-    parser.add_argument('--version_2_with_negative', action='store_true',
-                        help='If true, the SQuAD examples contain some that do not have an answer.')
-    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
-                        help="If null_score - best_non_null is greater than the threshold predict null.")
+    parser.add_argument(
+        "--version_2_with_negative",
+        action="store_true",
+        help="If true, the SQuAD examples contain some that do not have an answer.",
+    )
+    parser.add_argument(
+        "--null_score_diff_threshold",
+        type=float,
+        default=0.0,
+        help="If null_score - best_non_null is greater than the threshold predict null.",
+    )
 
-    parser.add_argument("--max_seq_length", default=384, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
-                             "longer than this will be truncated, and sequences shorter than this will be padded.")
-    parser.add_argument("--doc_stride", default=128, type=int,
-                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
-    parser.add_argument("--max_query_length", default=64, type=int,
-                        help="The maximum number of tokens for the question. Questions longer than this will "
-                             "be truncated to this length.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--max_seq_length",
+        default=384,
+        type=int,
+        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
+        "longer than this will be truncated, and sequences shorter than this will be padded.",
+    )
+    parser.add_argument(
+        "--doc_stride",
+        default=128,
+        type=int,
+        help="When splitting up a long document into chunks, how much stride to take between chunks.",
+    )
+    parser.add_argument(
+        "--max_query_length",
+        default=64,
+        type=int,
+        help="The maximum number of tokens for the question. Questions longer than this will "
+        "be truncated to this length.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
 
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
-    parser.add_argument("--n_best_size", default=20, type=int,
-                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
-    parser.add_argument("--max_answer_length", default=30, type=int,
-                        help="The maximum length of an answer that can be generated. This is needed because the start "
-                             "and end predictions are not conditioned on one another.")
-    parser.add_argument("--verbose_logging", action='store_true',
-                        help="If true, all of the warnings related to data processing will be printed. "
-                             "A number of warnings are expected for a normal SQuAD evaluation.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+    parser.add_argument(
+        "--n_best_size",
+        default=20,
+        type=int,
+        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
+    )
+    parser.add_argument(
+        "--max_answer_length",
+        default=30,
+        type=int,
+        help="The maximum length of an answer that can be generated. This is needed because the start "
+        "and end predictions are not conditioned on one another.",
+    )
+    parser.add_argument(
+        "--verbose_logging",
+        action="store_true",
+        help="If true, all of the warnings related to data processing will be printed. "
+        "A number of warnings are expected for a normal SQuAD evaluation.",
+    )
 
-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
 
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
     args = parser.parse_args()
 
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
 
     # Setup distant debugging if needed
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
+
         print("Waiting for debugger attach")
         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()
@@ -452,16 +596,24 @@ def main():
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
         args.n_gpu = 1
     args.device = device
 
     # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
 
     # Set seed
     set_seed(args)
@@ -472,15 +624,21 @@ def main():
 
     args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
 
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -495,7 +653,8 @@ def main():
     if args.fp16:
         try:
             import apex
-            apex.amp.register_half_function(torch, 'einsum')
+
+            apex.amp.register_half_function(torch, "einsum")
         except ImportError:
             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
 
@@ -505,7 +664,6 @@ def main():
         global_step, tr_loss = train(args, train_dataset, model, tokenizer)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
-
     # Save the trained model and the tokenizer
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Create output directory if needed
@@ -515,39 +673,42 @@ def main():
         logger.info("Saving model checkpoint to %s", args.output_dir)
         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
         model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = model_class.from_pretrained(args.output_dir)
         tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
         model.to(args.device)
 
-
     # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
             logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
 
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
 
         for checkpoint in checkpoints:
             # Reload the model
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
 
             # Evaluate
             result = evaluate(args, model, tokenizer, prefix=global_step)
 
-            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
             results.update(result)
 
     logger.info("Results: {}".format(results))
diff --git a/templates/adding_a_new_example_script/utils_xxx.py b/templates/adding_a_new_example_script/utils_xxx.py
index 3f4145e028..bd016bd306 100644
--- a/templates/adding_a_new_example_script/utils_xxx.py
+++ b/templates/adding_a_new_example_script/utils_xxx.py
@@ -1,4 +1,3 @@
-
 # coding=utf-8
 # Copyright 2018 XXX.  All rights reserved.
 #
@@ -37,14 +36,16 @@ class SquadExample(object):
     For examples without an answer, the start and end position are -1.
     """
 
-    def __init__(self,
-                 qas_id,
-                 question_text,
-                 doc_tokens,
-                 orig_answer_text=None,
-                 start_position=None,
-                 end_position=None,
-                 is_impossible=None):
+    def __init__(
+        self,
+        qas_id,
+        question_text,
+        doc_tokens,
+        orig_answer_text=None,
+        start_position=None,
+        end_position=None,
+        is_impossible=None,
+    ):
         self.qas_id = qas_id
         self.question_text = question_text
         self.doc_tokens = doc_tokens
@@ -59,8 +60,7 @@ class SquadExample(object):
     def __repr__(self):
         s = ""
         s += "qas_id: %s" % (self.qas_id)
-        s += ", question_text: %s" % (
-            self.question_text)
+        s += ", question_text: %s" % (self.question_text)
         s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
         if self.start_position:
             s += ", start_position: %d" % (self.start_position)
@@ -74,22 +74,24 @@ class SquadExample(object):
 class InputFeatures(object):
     """A single set of features of data."""
 
-    def __init__(self,
-                 unique_id,
-                 example_index,
-                 doc_span_index,
-                 tokens,
-                 token_to_orig_map,
-                 token_is_max_context,
-                 input_ids,
-                 input_mask,
-                 segment_ids,
-                 cls_index,
-                 p_mask,
-                 paragraph_len,
-                 start_position=None,
-                 end_position=None,
-                 is_impossible=None):
+    def __init__(
+        self,
+        unique_id,
+        example_index,
+        doc_span_index,
+        tokens,
+        token_to_orig_map,
+        token_is_max_context,
+        input_ids,
+        input_mask,
+        segment_ids,
+        cls_index,
+        p_mask,
+        paragraph_len,
+        start_position=None,
+        end_position=None,
+        is_impossible=None,
+    ):
         self.unique_id = unique_id
         self.example_index = example_index
         self.doc_span_index = doc_span_index
@@ -109,7 +111,7 @@ class InputFeatures(object):
 
 def read_squad_examples(input_file, is_training, version_2_with_negative):
     """Read a SQuAD json file into a list of SquadExample."""
-    with open(input_file, "r", encoding='utf-8') as reader:
+    with open(input_file, "r", encoding="utf-8") as reader:
         input_data = json.load(reader)["data"]
 
     def is_whitespace(c):
@@ -146,8 +148,7 @@ def read_squad_examples(input_file, is_training, version_2_with_negative):
                     if version_2_with_negative:
                         is_impossible = qa["is_impossible"]
                     if (len(qa["answers"]) != 1) and (not is_impossible):
-                        raise ValueError(
-                            "For training, each question should have exactly 1 answer.")
+                        raise ValueError("For training, each question should have exactly 1 answer.")
                     if not is_impossible:
                         answer = qa["answers"][0]
                         orig_answer_text = answer["text"]
@@ -161,12 +162,10 @@ def read_squad_examples(input_file, is_training, version_2_with_negative):
                         #
                         # Note that this means for training mode, every example is NOT
                         # guaranteed to be preserved.
-                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
-                        cleaned_answer_text = " ".join(
-                            whitespace_tokenize(orig_answer_text))
+                        actual_text = " ".join(doc_tokens[start_position : (end_position + 1)])
+                        cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                         if actual_text.find(cleaned_answer_text) == -1:
-                            logger.warning("Could not find answer: '%s' vs. '%s'",
-                                           actual_text, cleaned_answer_text)
+                            logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
                             continue
                     else:
                         start_position = -1
@@ -180,18 +179,29 @@ def read_squad_examples(input_file, is_training, version_2_with_negative):
                     orig_answer_text=orig_answer_text,
                     start_position=start_position,
                     end_position=end_position,
-                    is_impossible=is_impossible)
+                    is_impossible=is_impossible,
+                )
                 examples.append(example)
     return examples
 
 
-def convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                 doc_stride, max_query_length, is_training,
-                                 cls_token_at_end=False,
-                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
-                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
-                                 cls_token_segment_id=0, pad_token_segment_id=0,
-                                 mask_padding_with_zero=True):
+def convert_examples_to_features(
+    examples,
+    tokenizer,
+    max_seq_length,
+    doc_stride,
+    max_query_length,
+    is_training,
+    cls_token_at_end=False,
+    cls_token="[CLS]",
+    sep_token="[SEP]",
+    pad_token=0,
+    sequence_a_segment_id=0,
+    sequence_b_segment_id=1,
+    cls_token_segment_id=0,
+    pad_token_segment_id=0,
+    mask_padding_with_zero=True,
+):
     """Loads a data file into a list of `InputBatch`s."""
 
     unique_id = 1000000000
@@ -232,8 +242,8 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
             else:
                 tok_end_position = len(all_doc_tokens) - 1
             (tok_start_position, tok_end_position) = _improve_answer_span(
-                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
-                example.orig_answer_text)
+                all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.orig_answer_text
+            )
 
         # The -3 accounts for [CLS], [SEP] and [SEP]
         max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
@@ -241,8 +251,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
         # We can have documents that are longer than the maximum sequence length.
         # To deal with this we do a sliding window approach, where we take chunks
         # of the up to our max length with a stride of `doc_stride`.
-        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
-            "DocSpan", ["start", "length"])
+        _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])  # pylint: disable=invalid-name
         doc_spans = []
         start_offset = 0
         while start_offset < len(all_doc_tokens):
@@ -287,8 +296,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                 split_token_index = doc_span.start + i
                 token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
 
-                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
-                                                       split_token_index)
+                is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index)
                 token_is_max_context[len(tokens)] = is_max_context
                 tokens.append(all_doc_tokens[split_token_index])
                 segment_ids.append(sequence_b_segment_id)
@@ -333,8 +341,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                 doc_start = doc_span.start
                 doc_end = doc_span.start + doc_span.length - 1
                 out_of_span = False
-                if not (tok_start_position >= doc_start and
-                        tok_end_position <= doc_end):
+                if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                     out_of_span = True
                 if out_of_span:
                     start_position = 0
@@ -355,24 +362,23 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                 logger.info("example_index: %s" % (example_index))
                 logger.info("doc_span_index: %s" % (doc_span_index))
                 logger.info("tokens: %s" % " ".join(tokens))
-                logger.info("token_to_orig_map: %s" % " ".join([
-                    "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
-                logger.info("token_is_max_context: %s" % " ".join([
-                    "%d:%s" % (x, y) for (x, y) in token_is_max_context.items()
-                ]))
+                logger.info(
+                    "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])
+                )
+                logger.info(
+                    "token_is_max_context: %s"
+                    % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()])
+                )
                 logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-                logger.info(
-                    "input_mask: %s" % " ".join([str(x) for x in input_mask]))
-                logger.info(
-                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+                logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
                 if is_training and span_is_impossible:
                     logger.info("impossible example")
                 if is_training and not span_is_impossible:
-                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
+                    answer_text = " ".join(tokens[start_position : (end_position + 1)])
                     logger.info("start_position: %d" % (start_position))
                     logger.info("end_position: %d" % (end_position))
-                    logger.info(
-                        "answer: %s" % (answer_text))
+                    logger.info("answer: %s" % (answer_text))
 
             features.append(
                 InputFeatures(
@@ -390,14 +396,15 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                     paragraph_len=paragraph_len,
                     start_position=start_position,
                     end_position=end_position,
-                    is_impossible=span_is_impossible))
+                    is_impossible=span_is_impossible,
+                )
+            )
             unique_id += 1
 
     return features
 
 
-def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
-                         orig_answer_text):
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
     """Returns tokenized answer spans that better match the annotated answer."""
 
     # The SQuAD annotations are character based. We first project them to
@@ -426,7 +433,7 @@ def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
 
     for new_start in range(input_start, input_end + 1):
         for new_end in range(input_end, new_start - 1, -1):
-            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
+            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
             if text_span == tok_answer_text:
                 return (new_start, new_end)
 
@@ -470,13 +477,23 @@ def _check_is_max_context(doc_spans, cur_span_index, position):
     return cur_span_index == best_span_index
 
 
-RawResult = collections.namedtuple("RawResult",
-                                   ["unique_id", "start_logits", "end_logits"])
+RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"])
 
-def write_predictions(all_examples, all_features, all_results, n_best_size,
-                      max_answer_length, do_lower_case, output_prediction_file,
-                      output_nbest_file, output_null_log_odds_file, verbose_logging,
-                      version_2_with_negative, null_score_diff_threshold):
+
+def write_predictions(
+    all_examples,
+    all_features,
+    all_results,
+    n_best_size,
+    max_answer_length,
+    do_lower_case,
+    output_prediction_file,
+    output_nbest_file,
+    output_null_log_odds_file,
+    verbose_logging,
+    version_2_with_negative,
+    null_score_diff_threshold,
+):
     """Write final predictions to the json file and log-odds of null if needed."""
     logger.info("Writing predictions to: %s" % (output_prediction_file))
     logger.info("Writing nbest to: %s" % (output_nbest_file))
@@ -490,8 +507,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
         unique_id_to_result[result.unique_id] = result
 
     _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction",
-        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
+        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
+    )
 
     all_predictions = collections.OrderedDict()
     all_nbest_json = collections.OrderedDict()
@@ -544,7 +561,9 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                             start_index=start_index,
                             end_index=end_index,
                             start_logit=result.start_logits[start_index],
-                            end_logit=result.end_logits[end_index]))
+                            end_logit=result.end_logits[end_index],
+                        )
+                    )
         if version_2_with_negative:
             prelim_predictions.append(
                 _PrelimPrediction(
@@ -552,14 +571,14 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                     start_index=0,
                     end_index=0,
                     start_logit=null_start_logit,
-                    end_logit=null_end_logit))
-        prelim_predictions = sorted(
-            prelim_predictions,
-            key=lambda x: (x.start_logit + x.end_logit),
-            reverse=True)
+                    end_logit=null_end_logit,
+                )
+            )
+        prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
 
         _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-            "NbestPrediction", ["text", "start_logit", "end_logit"])
+            "NbestPrediction", ["text", "start_logit", "end_logit"]
+        )
 
         seen_predictions = {}
         nbest = []
@@ -568,10 +587,10 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                 break
             feature = features[pred.feature_index]
             if pred.start_index > 0:  # this is a non-null prediction
-                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+                tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
                 orig_doc_start = feature.token_to_orig_map[pred.start_index]
                 orig_doc_end = feature.token_to_orig_map[pred.end_index]
-                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+                orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
                 tok_text = " ".join(tok_tokens)
 
                 # De-tokenize WordPieces that have been split off.
@@ -592,31 +611,21 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                 final_text = ""
                 seen_predictions[final_text] = True
 
-            nbest.append(
-                _NbestPrediction(
-                    text=final_text,
-                    start_logit=pred.start_logit,
-                    end_logit=pred.end_logit))
+            nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
         # if we didn't include the empty option in the n-best, include it
         if version_2_with_negative:
             if "" not in seen_predictions:
-                nbest.append(
-                    _NbestPrediction(
-                        text="",
-                        start_logit=null_start_logit,
-                        end_logit=null_end_logit))
-                
+                nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
+
             # In very rare edge cases we could only have single null prediction.
             # So we just create a nonce prediction in this case to avoid failure.
-            if len(nbest)==1:
-                nbest.insert(0,
-                    _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+            if len(nbest) == 1:
+                nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
 
         # In very rare edge cases we could have no valid predictions. So we
         # just create a nonce prediction in this case to avoid failure.
         if not nbest:
-            nbest.append(
-                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+            nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
 
         assert len(nbest) >= 1
 
@@ -645,8 +654,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
             all_predictions[example.qas_id] = nbest_json[0]["text"]
         else:
             # predict "" iff the null score - the score of best non-null > threshold
-            score_diff = score_null - best_non_null_entry.start_logit - (
-                best_non_null_entry.end_logit)
+            score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
             scores_diff_json[example.qas_id] = score_diff
             if score_diff > null_score_diff_threshold:
                 all_predictions[example.qas_id] = ""
@@ -668,29 +676,40 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
 
 
 # For XLNet (and XLM which uses the same head)
-RawResultExtended = collections.namedtuple("RawResultExtended",
-    ["unique_id", "start_top_log_probs", "start_top_index",
-     "end_top_log_probs", "end_top_index", "cls_logits"])
+RawResultExtended = collections.namedtuple(
+    "RawResultExtended",
+    ["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits"],
+)
 
 
-def write_predictions_extended(all_examples, all_features, all_results, n_best_size,
-                                max_answer_length, output_prediction_file,
-                                output_nbest_file,
-                                output_null_log_odds_file, orig_data_file,
-                                start_n_top, end_n_top, version_2_with_negative,
-                                tokenizer, verbose_logging):
+def write_predictions_extended(
+    all_examples,
+    all_features,
+    all_results,
+    n_best_size,
+    max_answer_length,
+    output_prediction_file,
+    output_nbest_file,
+    output_null_log_odds_file,
+    orig_data_file,
+    start_n_top,
+    end_n_top,
+    version_2_with_negative,
+    tokenizer,
+    verbose_logging,
+):
     """ XLNet write prediction logic (more complex than Bert's).
         Write final predictions to the json file and log-odds of null if needed.
 
         Requires utils_squad_evaluate.py
     """
     _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction",
-        ["feature_index", "start_index", "end_index",
-        "start_log_prob", "end_log_prob"])
+        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]
+    )
 
     _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
+        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]
+    )
 
     logger.info("Writing predictions to: %s", output_prediction_file)
     # logger.info("Writing nbest to: %s" % (output_nbest_file))
@@ -754,12 +773,13 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
                             start_index=start_index,
                             end_index=end_index,
                             start_log_prob=start_log_prob,
-                            end_log_prob=end_log_prob))
+                            end_log_prob=end_log_prob,
+                        )
+                    )
 
         prelim_predictions = sorted(
-            prelim_predictions,
-            key=lambda x: (x.start_log_prob + x.end_log_prob),
-            reverse=True)
+            prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True
+        )
 
         seen_predictions = {}
         nbest = []
@@ -770,7 +790,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
 
             # XLNet un-tokenizer
             # Let's keep it simple for now and see if we need all this later.
-            # 
+            #
             # tok_start_to_orig_index = feature.tok_start_to_orig_index
             # tok_end_to_orig_index = feature.tok_end_to_orig_index
             # start_orig_pos = tok_start_to_orig_index[pred.start_index]
@@ -779,10 +799,10 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
             # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
 
             # Previously used Bert untokenizer
-            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+            tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
             orig_doc_start = feature.token_to_orig_map[pred.start_index]
             orig_doc_end = feature.token_to_orig_map[pred.end_index]
-            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+            orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
             tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
 
             # Clean whitespace
@@ -790,8 +810,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
             tok_text = " ".join(tok_text.split())
             orig_text = " ".join(orig_tokens)
 
-            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
-                                        verbose_logging)
+            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging)
 
             if final_text in seen_predictions:
                 continue
@@ -799,17 +818,13 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
             seen_predictions[final_text] = True
 
             nbest.append(
-                _NbestPrediction(
-                    text=final_text,
-                    start_log_prob=pred.start_log_prob,
-                    end_log_prob=pred.end_log_prob))
+                _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob)
+            )
 
         # In very rare edge cases we could have no valid predictions. So we
         # just create a nonce prediction in this case to avoid failure.
         if not nbest:
-            nbest.append(
-                _NbestPrediction(text="", start_log_prob=-1e6,
-                end_log_prob=-1e6))
+            nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6))
 
         total_scores = []
         best_non_null_entry = None
@@ -850,7 +865,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
         with open(output_null_log_odds_file, "w") as writer:
             writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
 
-    with open(orig_data_file, "r", encoding='utf-8') as reader:
+    with open(orig_data_file, "r", encoding="utf-8") as reader:
         orig_data = json.load(reader)["data"]
 
     qid_to_has_ans = make_qid_to_has_ans(orig_data)
@@ -914,8 +929,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
     start_position = tok_text.find(pred_text)
     if start_position == -1:
         if verbose_logging:
-            logger.info(
-                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
+            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
         return orig_text
     end_position = start_position + len(pred_text) - 1
 
@@ -924,8 +938,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
 
     if len(orig_ns_text) != len(tok_ns_text):
         if verbose_logging:
-            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
-                        orig_ns_text, tok_ns_text)
+            logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
         return orig_text
 
     # We then project the characters in `pred_text` back to `orig_text` using
@@ -956,7 +969,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
             logger.info("Couldn't map end position")
         return orig_text
 
-    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
+    output_text = orig_text[orig_start_position : (orig_end_position + 1)]
     return output_text
 
 
diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py
index 12d69799a9..370fbb569f 100644
--- a/templates/adding_a_new_model/configuration_xxx.py
+++ b/templates/adding_a_new_model/configuration_xxx.py
@@ -27,8 +27,8 @@ from .configuration_utils import PretrainedConfig
 logger = logging.getLogger(__name__)
 
 XXX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-config.json",
-    'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-config.json",
+    "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-config.json",
+    "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-config.json",
 }
 
 
@@ -63,24 +63,26 @@ class XxxConfig(PretrainedConfig):
     """
     pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP
 
-    def __init__(self,
-                 vocab_size=50257,
-                 n_positions=1024,
-                 n_ctx=1024,
-                 n_embd=768,
-                 n_layer=12,
-                 n_head=12,
-                 resid_pdrop=0.1,
-                 embd_pdrop=0.1,
-                 attn_pdrop=0.1,
-                 layer_norm_epsilon=1e-5,
-                 initializer_range=0.02,
-                 summary_type='cls_index',
-                 summary_use_proj=True,
-                 summary_activation=None,
-                 summary_proj_to_labels=True,
-                 summary_first_dropout=0.1,
-                 **kwargs):
+    def __init__(
+        self,
+        vocab_size=50257,
+        n_positions=1024,
+        n_ctx=1024,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        summary_type="cls_index",
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        **kwargs
+    ):
         super(XxxConfig, self).__init__(**kwargs)
         self.vocab_size = vocab_size
         self.n_ctx = n_ctx
diff --git a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
index 9d389deaad..99d3761496 100755
--- a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
+++ b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
@@ -24,8 +24,10 @@ import torch
 from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx
 
 import logging
+
 logging.basicConfig(level=logging.INFO)
 
+
 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
     # Initialise PyTorch model
     config = XxxConfig.from_json_file(config_file)
@@ -43,23 +45,19 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
-    parser.add_argument("--tf_checkpoint_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the TensorFlow checkpoint path.")
-    parser.add_argument("--config_file",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "The config json file corresponding to the pre-trained model. \n"
-                            "This specifies the model architecture.")
-    parser.add_argument("--pytorch_dump_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the output PyTorch model.")
+    parser.add_argument(
+        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
+    )
+    parser.add_argument(
+        "--config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The config json file corresponding to the pre-trained model. \n"
+        "This specifies the model architecture.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
     args = parser.parse_args()
-    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
-                                     args.config_file,
-                                     args.pytorch_dump_path)
+    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py
index 1783620998..a4477704ae 100644
--- a/templates/adding_a_new_model/modeling_tf_xxx.py
+++ b/templates/adding_a_new_model/modeling_tf_xxx.py
@@ -44,8 +44,8 @@ logger = logging.getLogger(__name__)
 # for the pretrained weights provided with the models
 ####################################################
 TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-tf_model.h5",
-    'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-tf_model.h5",
+    "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-tf_model.h5",
+    "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-tf_model.h5",
 }
 
 ####################################################
@@ -69,9 +69,9 @@ TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = {
 class TFXxxLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFXxxLayer, self).__init__(**kwargs)
-        self.attention = TFXxxAttention(config, name='attention')
-        self.intermediate = TFXxxIntermediate(config, name='intermediate')
-        self.transformer_output = TFXxxOutput(config, name='output')
+        self.attention = TFXxxAttention(config, name="attention")
+        self.intermediate = TFXxxIntermediate(config, name="intermediate")
+        self.transformer_output = TFXxxOutput(config, name="output")
 
     def call(self, inputs, training=False):
         hidden_states, attention_mask, head_mask = inputs
@@ -98,7 +98,9 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
     def _prune_heads(self, heads_to_prune):
         raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
 
-    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
+    def call(
+        self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False
+    ):
         # We allow three types of multi-inputs:
         # - traditional keyword arguments in the call method
         # - all the arguments provided as a dict in the first positional argument of call
@@ -113,11 +115,11 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
             head_mask = inputs[4] if len(inputs) > 4 else head_mask
             assert len(inputs) <= 5, "Too many inputs."
         elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            attention_mask = inputs.get('attention_mask', attention_mask)
-            token_type_ids = inputs.get('token_type_ids', token_type_ids)
-            position_ids = inputs.get('position_ids', position_ids)
-            head_mask = inputs.get('head_mask', head_mask)
+            input_ids = inputs.get("input_ids")
+            attention_mask = inputs.get("attention_mask", attention_mask)
+            token_type_ids = inputs.get("token_type_ids", token_type_ids)
+            position_ids = inputs.get("position_ids", position_ids)
+            head_mask = inputs.get("head_mask", head_mask)
             assert len(inputs) <= 5, "Too many inputs."
         else:
             input_ids = inputs
@@ -175,6 +177,7 @@ class TFXxxPreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = XxxConfig
     pretrained_model_archive_map = TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "transformer"
@@ -263,8 +266,12 @@ XXX_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.",
-                      XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.",
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class TFXxxModel(TFXxxPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -297,17 +304,19 @@ class TFXxxModel(TFXxxPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFXxxModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFXxxMainLayer(config, name='transformer')
+        self.transformer = TFXxxMainLayer(config, name="transformer")
 
     def call(self, inputs, **kwargs):
         outputs = self.transformer(inputs, **kwargs)
         return outputs
 
 
-@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING
+)
 class TFXxxForMaskedLM(TFXxxPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -333,26 +342,30 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel):
         prediction_scores = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFXxxForMaskedLM, self).__init__(config, *inputs, **kwargs)
 
-        self.transformer = TFXxxMainLayer(config, name='transformer')
-        self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name='mlm')
+        self.transformer = TFXxxMainLayer(config, name="transformer")
+        self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm")
 
     def call(self, inputs, **kwargs):
         outputs = self.transformer(inputs, **kwargs)
 
         sequence_output = outputs[0]
-        prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False))
+        prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
 
         outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
 
         return outputs  # prediction_scores, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings(
+    """Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
     the pooled output) e.g. for GLUE tasks. """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class TFXxxForSequenceClassification(TFXxxPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -378,22 +391,23 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel):
         logits = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFXxxForSequenceClassification, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
 
-        self.transformer = TFXxxMainLayer(config, name='transformer')
+        self.transformer = TFXxxMainLayer(config, name="transformer")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='classifier')
+        self.classifier = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
 
     def call(self, inputs, **kwargs):
         outputs = self.transformer(inputs, **kwargs)
 
         pooled_output = outputs[1]
 
-        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
+        pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
         logits = self.classifier(pooled_output)
 
         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
@@ -401,9 +415,12 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel):
         return outputs  # logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Xxx Model with a token classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """Xxx Model with a token classification head on top (a linear layer on top of
     the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class TFXxxForTokenClassification(TFXxxPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -429,22 +446,23 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel):
         scores = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFXxxForTokenClassification, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
 
-        self.transformer = TFXxxMainLayer(config, name='transformer')
+        self.transformer = TFXxxMainLayer(config, name="transformer")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='classifier')
+        self.classifier = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
 
     def call(self, inputs, **kwargs):
         outputs = self.transformer(inputs, **kwargs)
 
         sequence_output = outputs[0]
 
-        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
+        sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
         logits = self.classifier(sequence_output)
 
         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
@@ -452,9 +470,12 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel):
         return outputs  # scores, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
     the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class TFXxxForQuestionAnswering(TFXxxPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -482,14 +503,15 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel):
         start_scores, end_scores = outputs[:2]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFXxxForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
 
-        self.transformer = TFXxxMainLayer(config, name='transformer')
-        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='qa_outputs')
+        self.transformer = TFXxxMainLayer(config, name="transformer")
+        self.qa_outputs = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
+        )
 
     def call(self, inputs, **kwargs):
         outputs = self.transformer(inputs, **kwargs)
diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py
index 4c325196eb..7270376ec7 100644
--- a/templates/adding_a_new_model/modeling_xxx.py
+++ b/templates/adding_a_new_model/modeling_xxx.py
@@ -44,8 +44,8 @@ logger = logging.getLogger(__name__)
 # for the pretrained weights provided with the models
 ####################################################
 XXX_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-pytorch_model.bin",
-    'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-pytorch_model.bin",
+    "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-pytorch_model.bin",
+    "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-pytorch_model.bin",
 }
 
 ####################################################
@@ -60,8 +60,10 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
         raise
     tf_path = os.path.abspath(tf_checkpoint_path)
     logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
@@ -76,7 +78,7 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
         arrays.append(array)
 
     for name, array in zip(names, arrays):
-        name = name.split('/')
+        name = name.split("/")
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
         if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
@@ -84,18 +86,18 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
             continue
         pointer = model
         for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
-                l = re.split(r'_(\d+)', m_name)
+            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+                l = re.split(r"_(\d+)", m_name)
             else:
                 l = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
-                pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'squad':
-                pointer = getattr(pointer, 'classifier')
+            if l[0] == "kernel" or l[0] == "gamma":
+                pointer = getattr(pointer, "weight")
+            elif l[0] == "output_bias" or l[0] == "beta":
+                pointer = getattr(pointer, "bias")
+            elif l[0] == "output_weights":
+                pointer = getattr(pointer, "weight")
+            elif l[0] == "squad":
+                pointer = getattr(pointer, "classifier")
             else:
                 try:
                     pointer = getattr(pointer, l[0])
@@ -105,9 +107,9 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
             if len(l) >= 2:
                 num = int(l[1])
                 pointer = pointer[num]
-        if m_name[-11:] == '_embeddings':
-            pointer = getattr(pointer, 'weight')
-        elif m_name == 'kernel':
+        if m_name[-11:] == "_embeddings":
+            pointer = getattr(pointer, "weight")
+        elif m_name == "kernel":
             array = np.transpose(array)
         try:
             assert pointer.shape == array.shape
@@ -147,7 +149,6 @@ class XxxLayer(nn.Module):
         return outputs
 
 
-
 ####################################################
 # PreTrainedModel is a sub-class of torch.nn.Module
 # which take care of loading and saving pretrained weights
@@ -161,6 +162,7 @@ class XxxPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = XxxConfig
     pretrained_model_archive_map = XXX_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_xxx
@@ -246,8 +248,12 @@ XXX_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.",
-                      XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.",
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class XxxModel(XxxPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -277,6 +283,7 @@ class XxxModel(XxxPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config):
         super(XxxModel, self).__init__(config)
 
@@ -300,7 +307,15 @@ class XxxModel(XxxPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+    ):
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
@@ -329,7 +344,7 @@ class XxxModel(XxxPreTrainedModel):
         # positions we want to attend and -10000.0 for masked positions.
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
 
         # Prepare head mask if needed
@@ -342,14 +357,20 @@ class XxxModel(XxxPreTrainedModel):
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                 head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+                head_mask = (
+                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
+                )  # We can specify head_mask for each layer
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to fload if need + fp16 compatibility
         else:
             head_mask = [None] * self.config.num_hidden_layers
 
         ##################################
         # Replace this with your model code
-        embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds)
+        embedding_output = self.embeddings(
+            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
+        )
         encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
         sequence_output = encoder_outputs[0]
         outputs = (sequence_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
@@ -357,8 +378,9 @@ class XxxModel(XxxPreTrainedModel):
         return outputs  # sequence_output, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING
+)
 class XxxForMaskedLM(XxxPreTrainedModel):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -389,6 +411,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
         loss, prediction_scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(XxxForMaskedLM, self).__init__(config)
 
@@ -400,15 +423,25 @@ class XxxForMaskedLM(XxxPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_head
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
-                masked_lm_labels=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        masked_lm_labels=None,
+    ):
 
-        outputs = self.transformer(input_ids,
-                            attention_mask=attention_mask,
-                            token_type_ids=token_type_ids,
-                            position_ids=position_ids, 
-                            head_mask=head_mask,
-                            inputs_embeds=inputs_embeds)
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         sequence_output = outputs[0]
         prediction_scores = self.cls(sequence_output)
@@ -422,9 +455,12 @@ class XxxForMaskedLM(XxxPreTrainedModel):
         return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings(
+    """Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
     the pooled output) e.g. for GLUE tasks. """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class XxxForSequenceClassification(XxxPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -456,6 +492,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
         loss, logits = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(XxxForSequenceClassification, self).__init__(config)
         self.num_labels = config.num_labels
@@ -466,15 +503,25 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
-                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
 
-        outputs = self.transformer(input_ids,
-                            attention_mask=attention_mask,
-                            token_type_ids=token_type_ids,
-                            position_ids=position_ids, 
-                            head_mask=head_mask,
-                            inputs_embeds=inputs_embeds)
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         pooled_output = outputs[1]
 
@@ -496,9 +543,12 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
         return outputs  # (loss), logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Xxx Model with a token classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """Xxx Model with a token classification head on top (a linear layer on top of
     the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class XxxForTokenClassification(XxxPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -528,6 +578,7 @@ class XxxForTokenClassification(XxxPreTrainedModel):
         loss, scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(XxxForTokenClassification, self).__init__(config)
         self.num_labels = config.num_labels
@@ -538,15 +589,25 @@ class XxxForTokenClassification(XxxPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
-                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
 
-        outputs = self.transformer(input_ids,
-                            attention_mask=attention_mask,
-                            token_type_ids=token_type_ids,
-                            position_ids=position_ids, 
-                            head_mask=head_mask,
-                            inputs_embeds=inputs_embeds)
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         sequence_output = outputs[0]
 
@@ -569,9 +630,12 @@ class XxxForTokenClassification(XxxPreTrainedModel):
         return outputs  # (loss), scores, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
     the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class XxxForQuestionAnswering(XxxPreTrainedModel):
     r"""
         **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -613,6 +677,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
 
 
     """
+
     def __init__(self, config):
         super(XxxForQuestionAnswering, self).__init__(config)
         self.num_labels = config.num_labels
@@ -622,15 +687,26 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
-                start_positions=None, end_positions=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        start_positions=None,
+        end_positions=None,
+    ):
 
-        outputs = self.transformer(input_ids,
-                            attention_mask=attention_mask,
-                            token_type_ids=token_type_ids,
-                            position_ids=position_ids, 
-                            head_mask=head_mask,
-                            inputs_embeds=inputs_embeds)
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         sequence_output = outputs[0]
 
diff --git a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
index 6eba932a8e..1e4f64042a 100644
--- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 import unittest
 import sys
 
-from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_tf, slow
 
@@ -27,46 +27,57 @@ from transformers import XxxConfig, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from transformers.modeling_tf_xxx import (TFXxxModel, TFXxxForMaskedLM,
-                                               TFXxxForSequenceClassification,
-                                               TFXxxForTokenClassification,
-                                               TFXxxForQuestionAnswering,
-                                               TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from transformers.modeling_tf_xxx import (
+        TFXxxModel,
+        TFXxxForMaskedLM,
+        TFXxxForSequenceClassification,
+        TFXxxForTokenClassification,
+        TFXxxForQuestionAnswering,
+        TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
 
 @require_tf
 class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
 
-    all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering,
-                         TFXxxForSequenceClassification,
-                         TFXxxForTokenClassification) if is_tf_available() else ()
+    all_model_classes = (
+        (
+            TFXxxModel,
+            TFXxxForMaskedLM,
+            TFXxxForQuestionAnswering,
+            TFXxxForSequenceClassification,
+            TFXxxForTokenClassification,
+        )
+        if is_tf_available()
+        else ()
+    )
 
     class TFXxxModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_input_mask=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                    ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_input_mask=True,
+            use_token_type_ids=True,
+            use_labels=True,
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -120,15 +131,16 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
                 attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                 max_position_embeddings=self.max_position_embeddings,
                 type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range)
+                initializer_range=self.initializer_range,
+            )
 
             return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 
-        def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xxx_model(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFXxxModel(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
             sequence_output, pooled_output = model(inputs)
 
             inputs = [input_ids, input_mask]
@@ -141,78 +153,74 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
                 "pooled_output": pooled_output.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].shape),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
 
-
-        def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xxx_for_masked_lm(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFXxxForMaskedLM(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
-            prediction_scores, = model(inputs)
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            (prediction_scores,) = model(inputs)
             result = {
                 "prediction_scores": prediction_scores.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
-
-        def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xxx_for_sequence_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = TFXxxForSequenceClassification(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
-            logits, = model(inputs)
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            (logits,) = model(inputs)
             result = {
                 "logits": logits.numpy(),
             }
-            self.parent.assertListEqual(
-                list(result["logits"].shape),
-                [self.batch_size, self.num_labels])
+            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
 
-
-        def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xxx_for_token_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = TFXxxForTokenClassification(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
-            logits, = model(inputs)
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            (logits,) = model(inputs)
             result = {
                 "logits": logits.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["logits"].shape),
-                [self.batch_size, self.seq_length, self.num_labels])
+                list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
+            )
 
-
-        def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xxx_for_question_answering(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFXxxForQuestionAnswering(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
             start_logits, end_logits = model(inputs)
             result = {
                 "start_logits": start_logits.numpy(),
                 "end_logits": end_logits.numpy(),
             }
-            self.parent.assertListEqual(
-                list(result["start_logits"].shape),
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["end_logits"].shape),
-                [self.batch_size, self.seq_length])
-
+            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, input_mask,
-             sequence_labels, token_labels, choice_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            (
+                config,
+                input_ids,
+                token_type_ids,
+                input_mask,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
             return config, inputs_dict
 
     def setUp(self):
@@ -244,9 +252,10 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        for model_name in ['xxx-base-uncased']:
+        for model_name in ["xxx-base-uncased"]:
             model = TFXxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/templates/adding_a_new_model/tests/modeling_xxx_test.py b/templates/adding_a_new_model/tests/modeling_xxx_test.py
index 5e22392d00..2043d79655 100644
--- a/templates/adding_a_new_model/tests/modeling_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py
@@ -20,51 +20,60 @@ import unittest
 
 from transformers import is_torch_available
 
-from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .modeling_common_test import CommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 if is_torch_available():
-    from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
-                                        XxxForNextSentencePrediction, XxxForPreTraining,
-                                        XxxForQuestionAnswering, XxxForSequenceClassification,
-                                        XxxForTokenClassification, XxxForMultipleChoice)
+    from transformers import (
+        XxxConfig,
+        XxxModel,
+        XxxForMaskedLM,
+        XxxForNextSentencePrediction,
+        XxxForPreTraining,
+        XxxForQuestionAnswering,
+        XxxForSequenceClassification,
+        XxxForTokenClassification,
+        XxxForMultipleChoice,
+    )
     from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
 @require_torch
 class XxxModelTest(CommonTestCases.CommonModelTester):
 
-    all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
-                         XxxForSequenceClassification,
-                         XxxForTokenClassification) if is_torch_available() else ()
+    all_model_classes = (
+        (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering, XxxForSequenceClassification, XxxForTokenClassification)
+        if is_torch_available()
+        else ()
+    )
 
     class XxxModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_input_mask=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                    ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_input_mask=True,
+            use_token_type_ids=True,
+            use_labels=True,
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -118,16 +127,17 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
                 attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                 max_position_embeddings=self.max_position_embeddings,
                 type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range)
+                initializer_range=self.initializer_range,
+            )
 
             return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 
         def check_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
+            self.parent.assertListEqual(list(result["loss"].size()), [])
 
-        def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xxx_model(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = XxxModel(config=config)
             model.to(torch_device)
             model.eval()
@@ -140,83 +150,98 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
                 "pooled_output": pooled_output,
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
 
-
-        def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xxx_for_masked_lm(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = XxxForMaskedLM(config=config)
             model.to(torch_device)
             model.eval()
-            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
+            loss, prediction_scores = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
+            )
             result = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
             self.check_loss_output(result)
 
-
-        def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xxx_for_question_answering(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = XxxForQuestionAnswering(config=config)
             model.to(torch_device)
             model.eval()
-            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
-                                                   start_positions=sequence_labels, end_positions=sequence_labels)
+            loss, start_logits, end_logits = model(
+                input_ids,
+                attention_mask=input_mask,
+                token_type_ids=token_type_ids,
+                start_positions=sequence_labels,
+                end_positions=sequence_labels,
+            )
             result = {
                 "loss": loss,
                 "start_logits": start_logits,
                 "end_logits": end_logits,
             }
-            self.parent.assertListEqual(
-                list(result["start_logits"].size()),
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["end_logits"].size()),
-                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
             self.check_loss_output(result)
 
-
-        def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xxx_for_sequence_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = XxxForSequenceClassification(config)
             model.to(torch_device)
             model.eval()
-            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
+            loss, logits = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
+            )
             result = {
                 "loss": loss,
                 "logits": logits,
             }
-            self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.num_labels])
+            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
             self.check_loss_output(result)
 
-
-        def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xxx_for_token_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = XxxForTokenClassification(config=config)
             model.to(torch_device)
             model.eval()
-            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+            loss, logits = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
+            )
             result = {
                 "loss": loss,
                 "logits": logits,
             }
             self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.seq_length, self.num_labels])
+                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
+            )
             self.check_loss_output(result)
 
-
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, input_mask,
-             sequence_labels, token_labels, choice_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            (
+                config,
+                input_ids,
+                token_type_ids,
+                input_mask,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
             return config, inputs_dict
 
     def setUp(self):
@@ -252,5 +277,6 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
             model = XxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/templates/adding_a_new_model/tests/tokenization_xxx_test.py b/templates/adding_a_new_model/tests/tokenization_xxx_test.py
index 116083edc8..940de5c769 100644
--- a/templates/adding_a_new_model/tests/tokenization_xxx_test.py
+++ b/templates/adding_a_new_model/tests/tokenization_xxx_test.py
@@ -18,10 +18,11 @@ import os
 import unittest
 from io import open
 
-from transformers.tokenization_bert import (XxxTokenizer, VOCAB_FILES_NAMES)
+from transformers.tokenization_bert import XxxTokenizer, VOCAB_FILES_NAMES
 
 from .tokenization_tests_commons import CommonTestCases
 
+
 class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
     tokenizer_class = XxxTokenizer
@@ -30,28 +31,39 @@ class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester):
         super(XxxTokenizationTest, self).setUp()
 
         vocab_tokens = [
-            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
-            "##ing", ",", "low", "lowest",
+            "[UNK]",
+            "[CLS]",
+            "[SEP]",
+            "want",
+            "##want",
+            "##ed",
+            "wa",
+            "un",
+            "runn",
+            "##ing",
+            ",",
+            "low",
+            "lowest",
         ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-        with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
     def get_tokenizer(self, **kwargs):
         return XxxTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self):
-        input_text = u"UNwant\u00E9d,running"
-        output_text = u"unwanted, running"
+        input_text = "UNwant\u00E9d,running"
+        output_text = "unwanted, running"
         return input_text, output_text
 
     def test_full_tokenizer(self):
         tokenizer = self.tokenizer_class(self.vocab_file)
 
-        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
+        tokens = tokenizer.tokenize("UNwant\u00E9d,running")
         self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
         self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py
index 7a10a41e5a..c1ea93a6d2 100644
--- a/templates/adding_a_new_model/tokenization_xxx.py
+++ b/templates/adding_a_new_model/tokenization_xxx.py
@@ -34,17 +34,16 @@ logger = logging.getLogger(__name__)
 # Mapping from the keyword arguments names of Tokenizer `__init__`
 # to file names for serializing Tokenizer instances
 ####################################################
-VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
 
 ####################################################
 # Mapping from the keyword arguments names of Tokenizer `__init__`
 # to pretrained vocabulary URL for all the model shortcut names.
 ####################################################
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
-    {
-        'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-vocab.txt",
-        'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-vocab.txt",
+    "vocab_file": {
+        "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-vocab.txt",
+        "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-vocab.txt",
     }
 }
 
@@ -52,8 +51,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
 # Mapping from model shortcut names to max length of inputs
 ####################################################
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'xxx-base-uncased': 512,
-    'xxx-large-uncased': 512,
+    "xxx-base-uncased": 512,
+    "xxx-large-uncased": 512,
 }
 
 ####################################################
@@ -62,8 +61,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 # To be used for checkpoint specific configurations.
 ####################################################
 PRETRAINED_INIT_CONFIGURATION = {
-    'xxx-base-uncased': {'do_lower_case': True},
-    'xxx-large-uncased': {'do_lower_case': True},
+    "xxx-base-uncased": {"do_lower_case": True},
+    "xxx-large-uncased": {"do_lower_case": True},
 }
 
 
@@ -73,7 +72,7 @@ def load_vocab(vocab_file):
     with open(vocab_file, "r", encoding="utf-8") as reader:
         tokens = reader.readlines()
     for index, token in enumerate(tokens):
-        token = token.rstrip('\n')
+        token = token.rstrip("\n")
         vocab[token] = index
     return vocab
 
@@ -93,9 +92,17 @@ class XxxTokenizer(PreTrainedTokenizer):
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, do_lower_case=True,
-                 unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
-                 mask_token="[MASK]", **kwargs):
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=True,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        **kwargs
+    ):
         """Constructs a XxxTokenizer.
 
         Args:
@@ -104,16 +111,22 @@ class XxxTokenizer(PreTrainedTokenizer):
                 Whether to lower case the input
                 Only has an effect when do_basic_tokenize=True
         """
-        super(XxxTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
-                                           pad_token=pad_token, cls_token=cls_token,
-                                           mask_token=mask_token, **kwargs)
+        super(XxxTokenizer, self).__init__(
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            **kwargs
+        )
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
 
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
-                "model use `tokenizer = XxxTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+                "model use `tokenizer = XxxTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
+            )
         self.vocab = load_vocab(vocab_file)
 
     @property
@@ -142,7 +155,7 @@ class XxxTokenizer(PreTrainedTokenizer):
 
     def convert_tokens_to_string(self, tokens):
         """ Converts a sequence of tokens (string) in a single string. """
-        out_string = ' '.join(tokens).replace(' ##', '').strip()
+        out_string = " ".join(tokens).replace(" ##", "").strip()
         return out_string
 
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
@@ -177,8 +190,10 @@ class XxxTokenizer(PreTrainedTokenizer):
 
         if already_has_special_tokens:
             if token_ids_1 is not None:
-                raise ValueError("You should not supply a second sequence if the provided sequence of "
-                                 "ids is already formated with special tokens for the model.")
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formated with special tokens for the model."
+                )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
         if token_ids_1 is not None:
@@ -204,15 +219,17 @@ class XxxTokenizer(PreTrainedTokenizer):
         """Save the tokenizer vocabulary to a directory or file."""
         index = 0
         if os.path.isdir(vocab_path):
-            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
+            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
         else:
             vocab_file = vocab_path
         with open(vocab_file, "w", encoding="utf-8") as writer:
             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                 if index != token_index:
-                    logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
-                                   " Please check that the vocabulary is not corrupted!".format(vocab_file))
+                    logger.warning(
+                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!".format(vocab_file)
+                    )
                     index = token_index
-                writer.write(token + u'\n')
+                writer.write(token + "\n")
                 index += 1
         return (vocab_file,)
diff --git a/transformers/__init__.py b/transformers/__init__.py
index 017fe476e7..318cd5ce4e 100755
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -6,8 +6,9 @@ __version__ = "2.3.0"
 # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
 try:
     import absl.logging
-    absl.logging.set_verbosity('info')
-    absl.logging.set_stderrthreshold('info')
+
+    absl.logging.set_verbosity("info")
+    absl.logging.set_stderrthreshold("info")
     absl.logging._warn_preinit_stderr = False
 except:
     pass
@@ -17,19 +18,41 @@ import logging
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
 # Files and general utilities
-from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
-                         cached_path, add_start_docstrings, add_end_docstrings,
-                         WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, MODEL_CARD_NAME,
-                         is_tf_available, is_torch_available)
+from .file_utils import (
+    TRANSFORMERS_CACHE,
+    PYTORCH_TRANSFORMERS_CACHE,
+    PYTORCH_PRETRAINED_BERT_CACHE,
+    cached_path,
+    add_start_docstrings,
+    add_end_docstrings,
+    WEIGHTS_NAME,
+    TF2_WEIGHTS_NAME,
+    TF_WEIGHTS_NAME,
+    CONFIG_NAME,
+    MODEL_CARD_NAME,
+    is_tf_available,
+    is_torch_available,
+)
 
-from .data import (is_sklearn_available,
-                   InputExample, InputFeatures, DataProcessor,
-                   SingleSentenceClassificationProcessor,
-                   glue_output_modes, glue_convert_examples_to_features,
-                   glue_processors, glue_tasks_num_labels,
-                   xnli_output_modes, xnli_processors, xnli_tasks_num_labels,
-                   squad_convert_examples_to_features, SquadFeatures, 
-                   SquadExample, SquadV1Processor, SquadV2Processor)
+from .data import (
+    is_sklearn_available,
+    InputExample,
+    InputFeatures,
+    DataProcessor,
+    SingleSentenceClassificationProcessor,
+    glue_output_modes,
+    glue_convert_examples_to_features,
+    glue_processors,
+    glue_tasks_num_labels,
+    xnli_output_modes,
+    xnli_processors,
+    xnli_tasks_num_labels,
+    squad_convert_examples_to_features,
+    SquadFeatures,
+    SquadExample,
+    SquadV1Processor,
+    SquadV2Processor,
+)
 
 if is_sklearn_available():
     from .data import glue_compute_metrics, xnli_compute_metrics
@@ -38,12 +61,12 @@ if is_sklearn_available():
 from .modelcard import ModelCard
 
 # Tokenizers
-from .tokenization_utils import (PreTrainedTokenizer)
+from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_auto import AutoTokenizer
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
-from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
+from .tokenization_transfo_xl import TransfoXLTokenizer, TransfoXLCorpus
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_ctrl import CTRLTokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
@@ -75,143 +98,281 @@ from .configuration_mmbt import MMBTConfig
 
 # Modeling
 if is_torch_available():
-    from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
-    from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
-                                AutoModelWithLMHead, AutoModelForTokenClassification, ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_utils import PreTrainedModel, prune_layer, Conv1D
+    from .modeling_auto import (
+        AutoModel,
+        AutoModelForSequenceClassification,
+        AutoModelForQuestionAnswering,
+        AutoModelWithLMHead,
+        AutoModelForTokenClassification,
+        ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
-    from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
-                                BertForMaskedLM, BertForNextSentencePrediction,
-                                BertForSequenceClassification, BertForMultipleChoice,
-                                BertForTokenClassification, BertForQuestionAnswering,
-                                load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
-                                  OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
-                                  load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
-                                    AdaptiveEmbedding,
-                                    load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
-                                GPT2LMHeadModel, GPT2DoubleHeadsModel,
-                                load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_ctrl import (CTRLPreTrainedModel, CTRLModel,
-                                CTRLLMHeadModel,
-                                CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
-                                XLNetForSequenceClassification, XLNetForTokenClassification,
-                                XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple,
-                                XLNetForQuestionAnswering, load_tf_weights_in_xlnet,
-                                XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
-                            XLMWithLMHeadModel, XLMForSequenceClassification,
-                            XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
-                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_roberta import (RobertaForMaskedLM, RobertaModel,
-                                RobertaForSequenceClassification, RobertaForMultipleChoice,
-                                RobertaForTokenClassification, RobertaForQuestionAnswering,
-                                ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_distilbert import (DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel,
-                                DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
-                                DistilBertForTokenClassification,
-                                DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_camembert import (CamembertForMaskedLM, CamembertModel,
-                                CamembertForSequenceClassification, CamembertForMultipleChoice,
-                                CamembertForTokenClassification,
-                                CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_bert import (
+        BertPreTrainedModel,
+        BertModel,
+        BertForPreTraining,
+        BertForMaskedLM,
+        BertForNextSentencePrediction,
+        BertForSequenceClassification,
+        BertForMultipleChoice,
+        BertForTokenClassification,
+        BertForQuestionAnswering,
+        load_tf_weights_in_bert,
+        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_openai import (
+        OpenAIGPTPreTrainedModel,
+        OpenAIGPTModel,
+        OpenAIGPTLMHeadModel,
+        OpenAIGPTDoubleHeadsModel,
+        load_tf_weights_in_openai_gpt,
+        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_transfo_xl import (
+        TransfoXLPreTrainedModel,
+        TransfoXLModel,
+        TransfoXLLMHeadModel,
+        AdaptiveEmbedding,
+        load_tf_weights_in_transfo_xl,
+        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_gpt2 import (
+        GPT2PreTrainedModel,
+        GPT2Model,
+        GPT2LMHeadModel,
+        GPT2DoubleHeadsModel,
+        load_tf_weights_in_gpt2,
+        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_ctrl import CTRLPreTrainedModel, CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
+    from .modeling_xlnet import (
+        XLNetPreTrainedModel,
+        XLNetModel,
+        XLNetLMHeadModel,
+        XLNetForSequenceClassification,
+        XLNetForTokenClassification,
+        XLNetForMultipleChoice,
+        XLNetForQuestionAnsweringSimple,
+        XLNetForQuestionAnswering,
+        load_tf_weights_in_xlnet,
+        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_xlm import (
+        XLMPreTrainedModel,
+        XLMModel,
+        XLMWithLMHeadModel,
+        XLMForSequenceClassification,
+        XLMForQuestionAnswering,
+        XLMForQuestionAnsweringSimple,
+        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_roberta import (
+        RobertaForMaskedLM,
+        RobertaModel,
+        RobertaForSequenceClassification,
+        RobertaForMultipleChoice,
+        RobertaForTokenClassification,
+        RobertaForQuestionAnswering,
+        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_distilbert import (
+        DistilBertPreTrainedModel,
+        DistilBertForMaskedLM,
+        DistilBertModel,
+        DistilBertForSequenceClassification,
+        DistilBertForQuestionAnswering,
+        DistilBertForTokenClassification,
+        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_camembert import (
+        CamembertForMaskedLM,
+        CamembertModel,
+        CamembertForSequenceClassification,
+        CamembertForMultipleChoice,
+        CamembertForTokenClassification,
+        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
     from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
-    from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel,
-                              load_tf_weights_in_t5,
-                              T5_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification,
-                                AlbertForQuestionAnswering,
-                                load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_xlm_roberta import (XLMRobertaForMaskedLM, XLMRobertaModel, XLMRobertaForMultipleChoice,
-                                       XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification)
+    from .modeling_t5 import (
+        T5PreTrainedModel,
+        T5Model,
+        T5WithLMHeadModel,
+        load_tf_weights_in_t5,
+        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_albert import (
+        AlbertPreTrainedModel,
+        AlbertModel,
+        AlbertForMaskedLM,
+        AlbertForSequenceClassification,
+        AlbertForQuestionAnswering,
+        load_tf_weights_in_albert,
+        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_xlm_roberta import (
+        XLMRobertaForMaskedLM,
+        XLMRobertaModel,
+        XLMRobertaForMultipleChoice,
+        XLMRobertaForSequenceClassification,
+        XLMRobertaForTokenClassification,
+    )
     from .modeling_mmbt import ModalEmbeddings, MMBTModel, MMBTForClassification
 
     # Optimization
-    from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
-                               get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)
+    from .optimization import (
+        AdamW,
+        get_constant_schedule,
+        get_constant_schedule_with_warmup,
+        get_cosine_schedule_with_warmup,
+        get_cosine_with_hard_restarts_schedule_with_warmup,
+        get_linear_schedule_with_warmup,
+    )
 
 
 # TensorFlow
 if is_tf_available():
     from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
-    from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
-                                   TFAutoModelWithLMHead, TFAutoModelForTokenClassification, TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_auto import (
+        TFAutoModel,
+        TFAutoModelForSequenceClassification,
+        TFAutoModelForQuestionAnswering,
+        TFAutoModelWithLMHead,
+        TFAutoModelForTokenClassification,
+        TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
-    from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings,
-                                   TFBertModel, TFBertForPreTraining,
-                                   TFBertForMaskedLM, TFBertForNextSentencePrediction,
-                                   TFBertForSequenceClassification, TFBertForMultipleChoice,
-                                   TFBertForTokenClassification, TFBertForQuestionAnswering,
-                                   TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_bert import (
+        TFBertPreTrainedModel,
+        TFBertMainLayer,
+        TFBertEmbeddings,
+        TFBertModel,
+        TFBertForPreTraining,
+        TFBertForMaskedLM,
+        TFBertForNextSentencePrediction,
+        TFBertForSequenceClassification,
+        TFBertForMultipleChoice,
+        TFBertForTokenClassification,
+        TFBertForQuestionAnswering,
+        TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
-    from .modeling_tf_gpt2 import (TFGPT2PreTrainedModel, TFGPT2MainLayer,
-                                   TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel,
-                                   TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_gpt2 import (
+        TFGPT2PreTrainedModel,
+        TFGPT2MainLayer,
+        TFGPT2Model,
+        TFGPT2LMHeadModel,
+        TFGPT2DoubleHeadsModel,
+        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
-    from .modeling_tf_openai import (TFOpenAIGPTPreTrainedModel, TFOpenAIGPTMainLayer,
-                                     TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel,
-                                     TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_openai import (
+        TFOpenAIGPTPreTrainedModel,
+        TFOpenAIGPTMainLayer,
+        TFOpenAIGPTModel,
+        TFOpenAIGPTLMHeadModel,
+        TFOpenAIGPTDoubleHeadsModel,
+        TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
-    from .modeling_tf_transfo_xl import (TFTransfoXLPreTrainedModel, TFTransfoXLMainLayer,
-                                         TFTransfoXLModel, TFTransfoXLLMHeadModel,
-                                         TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_transfo_xl import (
+        TFTransfoXLPreTrainedModel,
+        TFTransfoXLMainLayer,
+        TFTransfoXLModel,
+        TFTransfoXLLMHeadModel,
+        TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
-    from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
-                                    TFXLNetModel, TFXLNetLMHeadModel,
-                                    TFXLNetForSequenceClassification,
-                                    TFXLNetForTokenClassification,
-                                    TFXLNetForQuestionAnsweringSimple,
-                                    TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_xlnet import (
+        TFXLNetPreTrainedModel,
+        TFXLNetMainLayer,
+        TFXLNetModel,
+        TFXLNetLMHeadModel,
+        TFXLNetForSequenceClassification,
+        TFXLNetForTokenClassification,
+        TFXLNetForQuestionAnsweringSimple,
+        TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
-    from .modeling_tf_xlm import (TFXLMPreTrainedModel, TFXLMMainLayer,
-                                  TFXLMModel, TFXLMWithLMHeadModel,
-                                  TFXLMForSequenceClassification,
-                                  TFXLMForQuestionAnsweringSimple,
-                                  TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_xlm import (
+        TFXLMPreTrainedModel,
+        TFXLMMainLayer,
+        TFXLMModel,
+        TFXLMWithLMHeadModel,
+        TFXLMForSequenceClassification,
+        TFXLMForQuestionAnsweringSimple,
+        TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
-    from .modeling_tf_roberta import (TFRobertaPreTrainedModel, TFRobertaMainLayer,
-                                      TFRobertaModel, TFRobertaForMaskedLM,
-                                      TFRobertaForSequenceClassification,
-                                      TFRobertaForTokenClassification,
-                                      TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_roberta import (
+        TFRobertaPreTrainedModel,
+        TFRobertaMainLayer,
+        TFRobertaModel,
+        TFRobertaForMaskedLM,
+        TFRobertaForSequenceClassification,
+        TFRobertaForTokenClassification,
+        TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
-    from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
-                                         TFDistilBertModel, TFDistilBertForMaskedLM,
-                                         TFDistilBertForSequenceClassification,
-                                         TFDistilBertForTokenClassification,
-                                         TFDistilBertForQuestionAnswering,
-                                         TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_distilbert import (
+        TFDistilBertPreTrainedModel,
+        TFDistilBertMainLayer,
+        TFDistilBertModel,
+        TFDistilBertForMaskedLM,
+        TFDistilBertForSequenceClassification,
+        TFDistilBertForTokenClassification,
+        TFDistilBertForQuestionAnswering,
+        TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
-    from .modeling_tf_ctrl import (TFCTRLPreTrainedModel, TFCTRLModel,
-                                    TFCTRLLMHeadModel,
-                                    TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_ctrl import (
+        TFCTRLPreTrainedModel,
+        TFCTRLModel,
+        TFCTRLLMHeadModel,
+        TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
-    from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
-                                     TFAlbertForSequenceClassification,
-                                    TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_albert import (
+        TFAlbertPreTrainedModel,
+        TFAlbertModel,
+        TFAlbertForMaskedLM,
+        TFAlbertForSequenceClassification,
+        TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
-    from .modeling_tf_t5 import (TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel,
-                                 TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_t5 import TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
 
     # Optimization
-    from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
+    from .optimization_tf import WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator
 
 # TF 2.0 <=> PyTorch conversion utilities
-from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
-                                        load_pytorch_checkpoint_in_tf2_model,
-                                        load_pytorch_weights_in_tf2_model,
-                                        load_pytorch_model_in_tf2_model,
-                                        load_tf2_checkpoint_in_pytorch_model,
-                                        load_tf2_weights_in_pytorch_model,
-                                        load_tf2_model_in_pytorch_model)
+from .modeling_tf_pytorch_utils import (
+    convert_tf_weight_name_to_pt_weight_name,
+    load_pytorch_checkpoint_in_tf2_model,
+    load_pytorch_weights_in_tf2_model,
+    load_pytorch_model_in_tf2_model,
+    load_tf2_checkpoint_in_pytorch_model,
+    load_tf2_weights_in_pytorch_model,
+    load_tf2_model_in_pytorch_model,
+)
 
 # Pipelines
-from .pipelines import pipeline, PipelineDataFormat, CsvPipelineDataFormat, JsonPipelineDataFormat, PipedPipelineDataFormat, \
-    Pipeline, FeatureExtractionPipeline, QuestionAnsweringPipeline, NerPipeline, TextClassificationPipeline
+from .pipelines import (
+    pipeline,
+    PipelineDataFormat,
+    CsvPipelineDataFormat,
+    JsonPipelineDataFormat,
+    PipedPipelineDataFormat,
+    Pipeline,
+    FeatureExtractionPipeline,
+    QuestionAnsweringPipeline,
+    NerPipeline,
+    TextClassificationPipeline,
+)
 
 if not is_tf_available() and not is_torch_available():
-    logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found."
-                   "Models won't be available and only tokenizers, configuration"
-                   "and file/data utilities can be used.")
+    logger.warning(
+        "Neither PyTorch nor TensorFlow >= 2.0 have been found."
+        "Models won't be available and only tokenizers, configuration"
+        "and file/data utilities can be used."
+    )
diff --git a/transformers/__main__.py b/transformers/__main__.py
index dd259b04ee..3cabdd4fff 100644
--- a/transformers/__main__.py
+++ b/transformers/__main__.py
@@ -1,16 +1,21 @@
 # coding: utf8
 
+
 def main():
     import sys
+
     if len(sys.argv) < 2 or sys.argv[1] not in ["convert", "train", "predict", "serve"]:
         print(
-        "First argument to `transformers` command line interface should be one of: \n"
-        ">> convert serve train predict")
+            "First argument to `transformers` command line interface should be one of: \n"
+            ">> convert serve train predict"
+        )
     if sys.argv[1] == "convert":
         from transformers.commands import convert
+
         convert(sys.argv)
     elif sys.argv[1] == "train":
         from transformers.commands import train
+
         train(sys.argv)
     elif sys.argv[1] == "serve":
         pass
@@ -19,7 +24,6 @@ def main():
         # parser = ArgumentParser('Transformers CLI tool', usage='transformers serve <command> [<args>]')
         # commands_parser = parser.add_subparsers(help='transformers-cli command helpers')
 
-
         # # Register commands
         # ServeCommand.register_subcommand(commands_parser)
 
@@ -33,5 +37,6 @@ def main():
         # service = args.func(args)
         # service.run()
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
diff --git a/transformers/commands/__init__.py b/transformers/commands/__init__.py
index bbdd5655fc..13171f4285 100644
--- a/transformers/commands/__init__.py
+++ b/transformers/commands/__init__.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 from argparse import ArgumentParser
 
+
 class BaseTransformersCLICommand(ABC):
     @staticmethod
     @abstractmethod
diff --git a/transformers/commands/convert.py b/transformers/commands/convert.py
index 55dbf53734..e358d8532f 100644
--- a/transformers/commands/convert.py
+++ b/transformers/commands/convert.py
@@ -11,12 +11,12 @@ def convert_command_factory(args: Namespace):
     Factory function used to convert a model TF 1.0 checkpoint in a PyTorch checkpoint.
     :return: ServeCommand
     """
-    return ConvertCommand(args.model_type, args.tf_checkpoint, args.pytorch_dump_output,
-                          args.config, args.finetuning_task_name)
+    return ConvertCommand(
+        args.model_type, args.tf_checkpoint, args.pytorch_dump_output, args.config, args.finetuning_task_name
+    )
 
 
 class ConvertCommand(BaseTransformersCLICommand):
-
     @staticmethod
     def register_subcommand(parser: ArgumentParser):
         """
@@ -24,25 +24,39 @@ class ConvertCommand(BaseTransformersCLICommand):
         :param parser: Root parser to register command-specific arguments
         :return:
         """
-        train_parser = parser.add_parser('convert', help="CLI tool to run convert model from original "
-                                                         "author checkpoints to Transformesr PyTorch checkpoints.")
-        train_parser.add_argument('--model_type', type=str, required=True,
-                                  help='Model\'s type.')
-        train_parser.add_argument('--tf_checkpoint', type=str, required=True,
-                                  help='TensorFlow checkpoint path or folder.')
-        train_parser.add_argument('--pytorch_dump_output', type=str, required=True,
-                                  help='Path to the PyTorch savd model output.')
-        train_parser.add_argument('--config', type=str, default="",
-                                  help='Configuration file path or folder.')
-        train_parser.add_argument('--finetuning_task_name', type=str, default=None,
-                                  help='Optional fine-tuning task name if the TF model was a finetuned model.')
+        train_parser = parser.add_parser(
+            "convert",
+            help="CLI tool to run convert model from original "
+            "author checkpoints to Transformesr PyTorch checkpoints.",
+        )
+        train_parser.add_argument("--model_type", type=str, required=True, help="Model's type.")
+        train_parser.add_argument(
+            "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder."
+        )
+        train_parser.add_argument(
+            "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch savd model output."
+        )
+        train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.")
+        train_parser.add_argument(
+            "--finetuning_task_name",
+            type=str,
+            default=None,
+            help="Optional fine-tuning task name if the TF model was a finetuned model.",
+        )
         train_parser.set_defaults(func=convert_command_factory)
 
-    def __init__(self, model_type: str, tf_checkpoint: str, pytorch_dump_output: str,
-                 config: str, finetuning_task_name: str, *args):
-        self._logger = getLogger('transformers-cli/converting')
+    def __init__(
+        self,
+        model_type: str,
+        tf_checkpoint: str,
+        pytorch_dump_output: str,
+        config: str,
+        finetuning_task_name: str,
+        *args
+    ):
+        self._logger = getLogger("transformers-cli/converting")
 
-        self._logger.info('Loading model {}'.format(model_type))
+        self._logger.info("Loading model {}".format(model_type))
         self._model_type = model_type
         self._tf_checkpoint = tf_checkpoint
         self._pytorch_dump_output = pytorch_dump_output
@@ -52,63 +66,80 @@ class ConvertCommand(BaseTransformersCLICommand):
     def run(self):
         if self._model_type == "bert":
             try:
-                from transformers.convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
+                from transformers.convert_bert_original_tf_checkpoint_to_pytorch import (
+                    convert_tf_checkpoint_to_pytorch,
+                )
             except ImportError:
-                msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \
-                    "In that case, it requires TensorFlow to be installed. Please see " \
+                msg = (
+                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions."
+                )
                 raise ImportError(msg)
 
             convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
         elif self._model_type == "gpt":
-            from transformers.convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
-            convert_openai_checkpoint_to_pytorch(self._tf_checkpoint,
-                                                    self._config,
-                                                    self._pytorch_dump_output)
+            from transformers.convert_openai_original_tf_checkpoint_to_pytorch import (
+                convert_openai_checkpoint_to_pytorch,
+            )
+
+            convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
         elif self._model_type == "transfo_xl":
             try:
-                from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
+                from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import (
+                    convert_transfo_xl_checkpoint_to_pytorch,
+                )
             except ImportError:
-                msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \
-                    "In that case, it requires TensorFlow to be installed. Please see " \
+                msg = (
+                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions."
+                )
                 raise ImportError(msg)
 
-            if 'ckpt' in self._tf_checkpoint.lower():
+            if "ckpt" in self._tf_checkpoint.lower():
                 TF_CHECKPOINT = self._tf_checkpoint
                 TF_DATASET_FILE = ""
             else:
                 TF_DATASET_FILE = self._tf_checkpoint
                 TF_CHECKPOINT = ""
-            convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT,
-                                                        self._config,
-                                                        self._pytorch_dump_output,
-                                                        TF_DATASET_FILE)
+            convert_transfo_xl_checkpoint_to_pytorch(
+                TF_CHECKPOINT, self._config, self._pytorch_dump_output, TF_DATASET_FILE
+            )
         elif self._model_type == "gpt2":
             try:
-                from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
+                from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import (
+                    convert_gpt2_checkpoint_to_pytorch,
+                )
             except ImportError:
-                msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \
-                    "In that case, it requires TensorFlow to be installed. Please see " \
+                msg = (
+                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions."
+                )
                 raise ImportError(msg)
 
             convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
         elif self._model_type == "xlnet":
             try:
-                from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
+                from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import (
+                    convert_xlnet_checkpoint_to_pytorch,
+                )
             except ImportError:
-                msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \
-                    "In that case, it requires TensorFlow to be installed. Please see " \
+                msg = (
+                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions."
+                )
                 raise ImportError(msg)
 
-            convert_xlnet_checkpoint_to_pytorch(self._tf_checkpoint,
-                                                self._config,
-                                                self._pytorch_dump_output,
-                                                self._finetuning_task_name)
+            convert_xlnet_checkpoint_to_pytorch(
+                self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name
+            )
         elif self._model_type == "xlm":
-            from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
+            from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import (
+                convert_xlm_checkpoint_to_pytorch,
+            )
 
             convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
         else:
diff --git a/transformers/commands/download.py b/transformers/commands/download.py
index 0938f135d2..acfb3eeb92 100644
--- a/transformers/commands/download.py
+++ b/transformers/commands/download.py
@@ -8,13 +8,16 @@ def download_command_factory(args):
 
 
 class DownloadCommand(BaseTransformersCLICommand):
-
     @staticmethod
     def register_subcommand(parser: ArgumentParser):
-        download_parser = parser.add_parser('download')
-        download_parser.add_argument('--cache-dir', type=str, default=None, help='Path to location to store the models')
-        download_parser.add_argument('--force',  action='store_true', help='Force the model to be download even if already in cache-dir')
-        download_parser.add_argument('model', type=str, help='Name of the model to download')
+        download_parser = parser.add_parser("download")
+        download_parser.add_argument(
+            "--cache-dir", type=str, default=None, help="Path to location to store the models"
+        )
+        download_parser.add_argument(
+            "--force", action="store_true", help="Force the model to be download even if already in cache-dir"
+        )
+        download_parser.add_argument("model", type=str, help="Name of the model to download")
         download_parser.set_defaults(func=download_command_factory)
 
     def __init__(self, model: str, cache: str, force: bool):
@@ -26,4 +29,4 @@ class DownloadCommand(BaseTransformersCLICommand):
         from transformers import AutoModel, AutoTokenizer
 
         AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
-        AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
\ No newline at end of file
+        AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
diff --git a/transformers/commands/run.py b/transformers/commands/run.py
index df03cee9d7..6172263064 100644
--- a/transformers/commands/run.py
+++ b/transformers/commands/run.py
@@ -10,52 +10,72 @@ logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
 def try_infer_format_from_ext(path: str):
     if not path:
-        return 'pipe'
+        return "pipe"
 
     for ext in PipelineDataFormat.SUPPORTED_FORMATS:
         if path.endswith(ext):
             return ext
 
     raise Exception(
-        'Unable to determine file format from file extension {}. '
-        'Please provide the format through --format {}'.format(path, PipelineDataFormat.SUPPORTED_FORMATS)
+        "Unable to determine file format from file extension {}. "
+        "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS)
     )
 
 
 def run_command_factory(args):
-    nlp = pipeline(task=args.task,
-                   model=args.model if args.model else None,
-                   config=args.config,
-                   tokenizer=args.tokenizer,
-                   device=args.device)
-    format = try_infer_format_from_ext(args.input) if args.format == 'infer' else args.format
-    reader = PipelineDataFormat.from_str(format=format,
-                                         output_path=args.output,
-                                         input_path=args.input,
-                                         column=args.column if args.column else nlp.default_input_names,
-                                         overwrite=args.overwrite)
+    nlp = pipeline(
+        task=args.task,
+        model=args.model if args.model else None,
+        config=args.config,
+        tokenizer=args.tokenizer,
+        device=args.device,
+    )
+    format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format
+    reader = PipelineDataFormat.from_str(
+        format=format,
+        output_path=args.output,
+        input_path=args.input,
+        column=args.column if args.column else nlp.default_input_names,
+        overwrite=args.overwrite,
+    )
     return RunCommand(nlp, reader)
 
 
 class RunCommand(BaseTransformersCLICommand):
-
     def __init__(self, nlp: Pipeline, reader: PipelineDataFormat):
         self._nlp = nlp
         self._reader = reader
 
     @staticmethod
     def register_subcommand(parser: ArgumentParser):
-        run_parser = parser.add_parser('run', help="Run a pipeline through the CLI")
-        run_parser.add_argument('--task', choices=SUPPORTED_TASKS.keys(), help='Task to run')
-        run_parser.add_argument('--input', type=str, help='Path to the file to use for inference')
-        run_parser.add_argument('--output', type=str, help='Path to the file that will be used post to write results.')
-        run_parser.add_argument('--model', type=str, help='Name or path to the model to instantiate.')
-        run_parser.add_argument('--config', type=str, help='Name or path to the model\'s config to instantiate.')
-        run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. (default: same as the model name)')
-        run_parser.add_argument('--column', type=str, help='Name of the column to use as input. (For multi columns input as QA use column1,columns2)')
-        run_parser.add_argument('--format', type=str, default='infer', choices=PipelineDataFormat.SUPPORTED_FORMATS, help='Input format to read from')
-        run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)')
-        run_parser.add_argument('--overwrite', action='store_true', help='Allow overwriting the output file.')
+        run_parser = parser.add_parser("run", help="Run a pipeline through the CLI")
+        run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run")
+        run_parser.add_argument("--input", type=str, help="Path to the file to use for inference")
+        run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.")
+        run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.")
+        run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.")
+        run_parser.add_argument(
+            "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)"
+        )
+        run_parser.add_argument(
+            "--column",
+            type=str,
+            help="Name of the column to use as input. (For multi columns input as QA use column1,columns2)",
+        )
+        run_parser.add_argument(
+            "--format",
+            type=str,
+            default="infer",
+            choices=PipelineDataFormat.SUPPORTED_FORMATS,
+            help="Input format to read from",
+        )
+        run_parser.add_argument(
+            "--device",
+            type=int,
+            default=-1,
+            help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)",
+        )
+        run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.")
         run_parser.set_defaults(func=run_command_factory)
 
     def run(self):
@@ -71,9 +91,6 @@ class RunCommand(BaseTransformersCLICommand):
         # Saving data
         if self._nlp.binary_output:
             binary_path = self._reader.save_binary(outputs)
-            logger.warning('Current pipeline requires output to be in binary format, saving at {}'.format(binary_path))
+            logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path))
         else:
             self._reader.save(outputs)
-
-
-
diff --git a/transformers/commands/serving.py b/transformers/commands/serving.py
index 4f41f797d1..f7729c0bf0 100644
--- a/transformers/commands/serving.py
+++ b/transformers/commands/serving.py
@@ -7,6 +7,7 @@ try:
     from uvicorn import run
     from fastapi import FastAPI, HTTPException, Body
     from pydantic import BaseModel
+
     _serve_dependancies_installed = True
 except (ImportError, AttributeError):
     BaseModel = object
@@ -17,18 +18,21 @@ from transformers import Pipeline
 from transformers.commands import BaseTransformersCLICommand
 from transformers.pipelines import SUPPORTED_TASKS, pipeline
 
-logger = logging.getLogger('transformers-cli/serving')
+logger = logging.getLogger("transformers-cli/serving")
+
 
 def serve_command_factory(args: Namespace):
     """
     Factory function used to instantiate serving server from provided command line arguments.
     :return: ServeCommand
     """
-    nlp = pipeline(task=args.task,
-                   model=args.model if args.model else None,
-                   config=args.config,
-                   tokenizer=args.tokenizer,
-                   device=args.device)
+    nlp = pipeline(
+        task=args.task,
+        model=args.model if args.model else None,
+        config=args.config,
+        tokenizer=args.tokenizer,
+        device=args.device,
+    )
     return ServeCommand(nlp, args.host, args.port)
 
 
@@ -36,6 +40,7 @@ class ServeModelInfoResult(BaseModel):
     """
     Expose model information
     """
+
     infos: dict
 
 
@@ -43,6 +48,7 @@ class ServeTokenizeResult(BaseModel):
     """
     Tokenize result model
     """
+
     tokens: List[str]
     tokens_ids: Optional[List[int]]
 
@@ -51,6 +57,7 @@ class ServeDeTokenizeResult(BaseModel):
     """
     DeTokenize result model
     """
+
     text: str
 
 
@@ -58,11 +65,11 @@ class ServeForwardResult(BaseModel):
     """
     Forward result model
     """
+
     output: Any
 
 
 class ServeCommand(BaseTransformersCLICommand):
-
     @staticmethod
     def register_subcommand(parser: ArgumentParser):
         """
@@ -70,14 +77,23 @@ class ServeCommand(BaseTransformersCLICommand):
         :param parser: Root parser to register command-specific arguments
         :return:
         """
-        serve_parser = parser.add_parser('serve', help='CLI tool to run inference requests through REST and GraphQL endpoints.')
-        serve_parser.add_argument('--task', type=str, choices=SUPPORTED_TASKS.keys(), help='The task to run the pipeline on')
-        serve_parser.add_argument('--host', type=str, default='localhost', help='Interface the server will listen on.')
-        serve_parser.add_argument('--port', type=int, default=8888, help='Port the serving will listen to.')
-        serve_parser.add_argument('--model', type=str, help='Model\'s name or path to stored model.')
-        serve_parser.add_argument('--config', type=str, help='Model\'s config name or path to stored model.')
-        serve_parser.add_argument('--tokenizer', type=str, help='Tokenizer name to use.')
-        serve_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)')
+        serve_parser = parser.add_parser(
+            "serve", help="CLI tool to run inference requests through REST and GraphQL endpoints."
+        )
+        serve_parser.add_argument(
+            "--task", type=str, choices=SUPPORTED_TASKS.keys(), help="The task to run the pipeline on"
+        )
+        serve_parser.add_argument("--host", type=str, default="localhost", help="Interface the server will listen on.")
+        serve_parser.add_argument("--port", type=int, default=8888, help="Port the serving will listen to.")
+        serve_parser.add_argument("--model", type=str, help="Model's name or path to stored model.")
+        serve_parser.add_argument("--config", type=str, help="Model's config name or path to stored model.")
+        serve_parser.add_argument("--tokenizer", type=str, help="Tokenizer name to use.")
+        serve_parser.add_argument(
+            "--device",
+            type=int,
+            default=-1,
+            help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)",
+        )
         serve_parser.set_defaults(func=serve_command_factory)
 
     def __init__(self, pipeline: Pipeline, host: str, port: int):
@@ -87,18 +103,22 @@ class ServeCommand(BaseTransformersCLICommand):
         self._host = host
         self._port = port
         if not _serve_dependancies_installed:
-            raise ImportError("Using serve command requires FastAPI and unicorn. "
-                                "Please install transformers with [serving]: pip install transformers[serving]." 
-                                "Or install FastAPI and unicorn separatly.")
+            raise ImportError(
+                "Using serve command requires FastAPI and unicorn. "
+                "Please install transformers with [serving]: pip install transformers[serving]."
+                "Or install FastAPI and unicorn separatly."
+            )
         else:
-            logger.info('Serving model over {}:{}'.format(host, port))
+            logger.info("Serving model over {}:{}".format(host, port))
             self._app = FastAPI()
 
             # Register routes
-            self._app.add_api_route('/', self.model_info, response_model=ServeModelInfoResult, methods=['GET'])
-            self._app.add_api_route('/tokenize', self.tokenize, response_model=ServeTokenizeResult, methods=['POST'])
-            self._app.add_api_route('/detokenize', self.detokenize, response_model=ServeDeTokenizeResult, methods=['POST'])
-            self._app.add_api_route('/forward', self.forward, response_model=ServeForwardResult, methods=['POST'])
+            self._app.add_api_route("/", self.model_info, response_model=ServeModelInfoResult, methods=["GET"])
+            self._app.add_api_route("/tokenize", self.tokenize, response_model=ServeTokenizeResult, methods=["POST"])
+            self._app.add_api_route(
+                "/detokenize", self.detokenize, response_model=ServeDeTokenizeResult, methods=["POST"]
+            )
+            self._app.add_api_route("/forward", self.forward, response_model=ServeForwardResult, methods=["POST"])
 
     def run(self):
         run(self._app, host=self._host, port=self._port)
@@ -122,11 +142,14 @@ class ServeCommand(BaseTransformersCLICommand):
                 return ServeTokenizeResult(tokens=tokens_txt)
 
         except Exception as e:
-            raise HTTPException(status_code=500, detail={"model": '', "error": str(e)})
+            raise HTTPException(status_code=500, detail={"model": "", "error": str(e)})
 
-    def detokenize(self, tokens_ids: List[int] = Body(None, embed=True),
-                   skip_special_tokens: bool = Body(False, embed=True),
-                   cleanup_tokenization_spaces: bool = Body(True, embed=True)):
+    def detokenize(
+        self,
+        tokens_ids: List[int] = Body(None, embed=True),
+        skip_special_tokens: bool = Body(False, embed=True),
+        cleanup_tokenization_spaces: bool = Body(True, embed=True),
+    ):
         """
         Detokenize the provided tokens ids to readable text:
         - **tokens_ids**: List of tokens ids
@@ -135,9 +158,9 @@ class ServeCommand(BaseTransformersCLICommand):
         """
         try:
             decoded_str = self._pipeline.tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces)
-            return ServeDeTokenizeResult(model='', text=decoded_str)
+            return ServeDeTokenizeResult(model="", text=decoded_str)
         except Exception as e:
-            raise HTTPException(status_code=500, detail={"model": '', "error": str(e)})
+            raise HTTPException(status_code=500, detail={"model": "", "error": str(e)})
 
     def forward(self, inputs: Union[str, dict, List[str], List[int], List[dict]] = Body(None, embed=True)):
         """
diff --git a/transformers/commands/train.py b/transformers/commands/train.py
index 7b26745881..e51be71c75 100644
--- a/transformers/commands/train.py
+++ b/transformers/commands/train.py
@@ -3,9 +3,12 @@ from argparse import ArgumentParser, Namespace
 from logging import getLogger
 
 from transformers.commands import BaseTransformersCLICommand
-from transformers import (is_tf_available, is_torch_available,
-                          TextClassificationPipeline,
-                          SingleSentenceClassificationProcessor as Processor)
+from transformers import (
+    is_tf_available,
+    is_torch_available,
+    TextClassificationPipeline,
+    SingleSentenceClassificationProcessor as Processor,
+)
 
 if not is_tf_available() and not is_torch_available():
     raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training")
@@ -14,6 +17,7 @@ if not is_tf_available() and not is_torch_available():
 USE_XLA = False
 USE_AMP = False
 
+
 def train_command_factory(args: Namespace):
     """
     Factory function used to instantiate serving server from provided command line arguments.
@@ -23,7 +27,6 @@ def train_command_factory(args: Namespace):
 
 
 class TrainCommand(BaseTransformersCLICommand):
-
     @staticmethod
     def register_subcommand(parser: ArgumentParser):
         """
@@ -31,47 +34,54 @@ class TrainCommand(BaseTransformersCLICommand):
         :param parser: Root parser to register command-specific arguments
         :return:
         """
-        train_parser = parser.add_parser('train', help='CLI tool to train a model on a task.')
+        train_parser = parser.add_parser("train", help="CLI tool to train a model on a task.")
 
-        train_parser.add_argument('--train_data', type=str, required=True,
-                                  help="path to train (and optionally evaluation) dataset as a csv with "
-                                       "tab separated labels and sentences.")
-        train_parser.add_argument('--column_label', type=int, default=0,
-                                  help='Column of the dataset csv file with example labels.')
-        train_parser.add_argument('--column_text', type=int, default=1,
-                                  help='Column of the dataset csv file with example texts.')
-        train_parser.add_argument('--column_id', type=int, default=2,
-                                  help='Column of the dataset csv file with example ids.')
-        train_parser.add_argument('--skip_first_row', action='store_true',
-                                  help='Skip the first row of the csv file (headers).')
+        train_parser.add_argument(
+            "--train_data",
+            type=str,
+            required=True,
+            help="path to train (and optionally evaluation) dataset as a csv with "
+            "tab separated labels and sentences.",
+        )
+        train_parser.add_argument(
+            "--column_label", type=int, default=0, help="Column of the dataset csv file with example labels."
+        )
+        train_parser.add_argument(
+            "--column_text", type=int, default=1, help="Column of the dataset csv file with example texts."
+        )
+        train_parser.add_argument(
+            "--column_id", type=int, default=2, help="Column of the dataset csv file with example ids."
+        )
+        train_parser.add_argument(
+            "--skip_first_row", action="store_true", help="Skip the first row of the csv file (headers)."
+        )
 
-        train_parser.add_argument('--validation_data', type=str, default='',
-                                  help='path to validation dataset.')
-        train_parser.add_argument('--validation_split', type=float, default=0.1,
-                                  help="if validation dataset is not provided, fraction of train dataset "
-                                       "to use as validation dataset.")
+        train_parser.add_argument("--validation_data", type=str, default="", help="path to validation dataset.")
+        train_parser.add_argument(
+            "--validation_split",
+            type=float,
+            default=0.1,
+            help="if validation dataset is not provided, fraction of train dataset " "to use as validation dataset.",
+        )
 
-        train_parser.add_argument('--output', type=str, default='./',
-                                  help='path to saved the trained model.')
+        train_parser.add_argument("--output", type=str, default="./", help="path to saved the trained model.")
 
-        train_parser.add_argument('--task', type=str, default='text_classification',
-                                  help='Task to train the model on.')
-        train_parser.add_argument('--model', type=str, default='bert-base-uncased',
-                                  help='Model\'s name or path to stored model.')
-        train_parser.add_argument('--train_batch_size', type=int, default=32,
-                                  help='Batch size for training.')
-        train_parser.add_argument('--valid_batch_size', type=int, default=64,
-                                  help='Batch size for validation.')
-        train_parser.add_argument('--learning_rate', type=float, default=3e-5,
-                                  help="Learning rate.")
-        train_parser.add_argument('--adam_epsilon', type=float, default=1e-08,
-                                  help="Epsilon for Adam optimizer.")
+        train_parser.add_argument(
+            "--task", type=str, default="text_classification", help="Task to train the model on."
+        )
+        train_parser.add_argument(
+            "--model", type=str, default="bert-base-uncased", help="Model's name or path to stored model."
+        )
+        train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.")
+        train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.")
+        train_parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.")
+        train_parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon for Adam optimizer.")
         train_parser.set_defaults(func=train_command_factory)
 
     def __init__(self, args: Namespace):
-        self.logger = getLogger('transformers-cli/training')
+        self.logger = getLogger("transformers-cli/training")
 
-        self.framework = 'tf' if is_tf_available() else 'torch'
+        self.framework = "tf" if is_tf_available() else "torch"
 
         os.makedirs(args.output, exist_ok=True)
         assert os.path.isdir(args.output)
@@ -81,28 +91,32 @@ class TrainCommand(BaseTransformersCLICommand):
         self.column_text = args.column_text
         self.column_id = args.column_id
 
-        self.logger.info('Loading {} pipeline for {}'.format(args.task, args.model))
-        if args.task == 'text_classification':
+        self.logger.info("Loading {} pipeline for {}".format(args.task, args.model))
+        if args.task == "text_classification":
             self.pipeline = TextClassificationPipeline.from_pretrained(args.model)
-        elif args.task == 'token_classification':
+        elif args.task == "token_classification":
             raise NotImplementedError
-        elif args.task == 'question_answering':
+        elif args.task == "question_answering":
             raise NotImplementedError
 
-        self.logger.info('Loading dataset from {}'.format(args.train_data))
-        self.train_dataset = Processor.create_from_csv(args.train_data,
-                                                       column_label=args.column_label,
-                                                       column_text=args.column_text,
-                                                       column_id=args.column_id,
-                                                       skip_first_row=args.skip_first_row)
+        self.logger.info("Loading dataset from {}".format(args.train_data))
+        self.train_dataset = Processor.create_from_csv(
+            args.train_data,
+            column_label=args.column_label,
+            column_text=args.column_text,
+            column_id=args.column_id,
+            skip_first_row=args.skip_first_row,
+        )
         self.valid_dataset = None
         if args.validation_data:
-            self.logger.info('Loading validation dataset from {}'.format(args.validation_data))
-            self.valid_dataset = Processor.create_from_csv(args.validation_data,
-                                                           column_label=args.column_label,
-                                                           column_text=args.column_text,
-                                                           column_id=args.column_id,
-                                                           skip_first_row=args.skip_first_row)
+            self.logger.info("Loading validation dataset from {}".format(args.validation_data))
+            self.valid_dataset = Processor.create_from_csv(
+                args.validation_data,
+                column_label=args.column_label,
+                column_text=args.column_text,
+                column_id=args.column_id,
+                skip_first_row=args.skip_first_row,
+            )
 
         self.validation_split = args.validation_split
         self.train_batch_size = args.train_batch_size
@@ -111,7 +125,7 @@ class TrainCommand(BaseTransformersCLICommand):
         self.adam_epsilon = args.adam_epsilon
 
     def run(self):
-        if self.framework == 'tf':
+        if self.framework == "tf":
             return self.run_tf()
         return self.run_torch()
 
@@ -119,13 +133,15 @@ class TrainCommand(BaseTransformersCLICommand):
         raise NotImplementedError
 
     def run_tf(self):
-        self.pipeline.fit(self.train_dataset,
-                          validation_data=self.valid_dataset,
-                          validation_split=self.validation_split,
-                          learning_rate=self.learning_rate,
-                          adam_epsilon=self.adam_epsilon,
-                          train_batch_size=self.train_batch_size,
-                          valid_batch_size=self.valid_batch_size)
+        self.pipeline.fit(
+            self.train_dataset,
+            validation_data=self.valid_dataset,
+            validation_split=self.validation_split,
+            learning_rate=self.learning_rate,
+            adam_epsilon=self.adam_epsilon,
+            train_batch_size=self.train_batch_size,
+            valid_batch_size=self.valid_batch_size,
+        )
 
         # Save trained pipeline
         self.pipeline.save_pretrained(self.output)
diff --git a/transformers/commands/user.py b/transformers/commands/user.py
index 8e0e563422..d29867d7c8 100644
--- a/transformers/commands/user.py
+++ b/transformers/commands/user.py
@@ -9,28 +9,31 @@ from transformers.hf_api import HfApi, HfFolder, HTTPError
 class UserCommands(BaseTransformersCLICommand):
     @staticmethod
     def register_subcommand(parser: ArgumentParser):
-        login_parser = parser.add_parser('login')
+        login_parser = parser.add_parser("login")
         login_parser.set_defaults(func=lambda args: LoginCommand(args))
-        whoami_parser = parser.add_parser('whoami')
+        whoami_parser = parser.add_parser("whoami")
         whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args))
-        logout_parser = parser.add_parser('logout')
+        logout_parser = parser.add_parser("logout")
         logout_parser.set_defaults(func=lambda args: LogoutCommand(args))
-        list_parser = parser.add_parser('ls')
+        list_parser = parser.add_parser("ls")
         list_parser.set_defaults(func=lambda args: ListObjsCommand(args))
         # upload
-        upload_parser = parser.add_parser('upload')
-        upload_parser.add_argument('path', type=str, help='Local path of the folder or individual file to upload.')
-        upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override individual object filename on S3.')
+        upload_parser = parser.add_parser("upload")
+        upload_parser.add_argument("path", type=str, help="Local path of the folder or individual file to upload.")
+        upload_parser.add_argument(
+            "--filename", type=str, default=None, help="Optional: override individual object filename on S3."
+        )
         upload_parser.set_defaults(func=lambda args: UploadCommand(args))
 
 
-
 class ANSI:
     """
     Helper for en.wikipedia.org/wiki/ANSI_escape_code
     """
+
     _bold = u"\u001b[1m"
     _reset = u"\u001b[0m"
+
     @classmethod
     def bold(cls, s):
         return "{}{}{}".format(cls._bold, s, cls._reset)
@@ -44,14 +47,16 @@ class BaseUserCommand:
 
 class LoginCommand(BaseUserCommand):
     def run(self):
-        print("""
+        print(
+            """
         _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|  
         _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|        
         _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|    
         _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|        
         _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|  
 
-        """)
+        """
+        )
         username = input("Username: ")
         password = getpass()
         try:
@@ -101,16 +106,10 @@ class ListObjsCommand(BaseUserCommand):
         col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
         row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
         lines = []
-        lines.append(
-            row_format.format(*headers)
-        )
-        lines.append(
-            row_format.format(*["-" * w for w in col_widths])
-        )
+        lines.append(row_format.format(*headers))
+        lines.append(row_format.format(*["-" * w for w in col_widths]))
         for row in rows:
-            lines.append(
-                row_format.format(*row)
-            )
+            lines.append(row_format.format(*row))
         return "\n".join(lines)
 
     def run(self):
@@ -126,15 +125,8 @@ class ListObjsCommand(BaseUserCommand):
         if len(objs) == 0:
             print("No shared file yet")
             exit()
-        rows = [ [
-            obj.filename,
-            obj.LastModified,
-            obj.ETag,
-            obj.Size
-        ] for obj in objs ]
-        print(
-            self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"])
-        )
+        rows = [[obj.filename, obj.LastModified, obj.ETag, obj.Size] for obj in objs]
+        print(self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"]))
 
 
 class UploadCommand(BaseUserCommand):
@@ -143,13 +135,7 @@ class UploadCommand(BaseUserCommand):
         Recursively list all files in a folder.
         """
         entries: List[os.DirEntry] = list(os.scandir(rel_path))
-        files = [
-            (
-                os.path.join(os.getcwd(), f.path),  # filepath
-                f.path  # filename
-            )
-            for f in entries if f.is_file()
-        ]
+        files = [(os.path.join(os.getcwd(), f.path), f.path) for f in entries if f.is_file()]  # filepath  # filename
         for f in entries:
             if f.is_dir():
                 files += self.walk_dir(f.path)
@@ -173,22 +159,14 @@ class UploadCommand(BaseUserCommand):
             raise ValueError("Not a valid file or directory: {}".format(local_path))
 
         for filepath, filename in files:
-            print(
-                "About to upload file {} to S3 under filename {}".format(
-                    ANSI.bold(filepath), ANSI.bold(filename)
-                )
-            )
+            print("About to upload file {} to S3 under filename {}".format(ANSI.bold(filepath), ANSI.bold(filename)))
 
         choice = input("Proceed? [Y/n] ").lower()
-        if not(choice == "" or choice == "y" or choice == "yes"):
+        if not (choice == "" or choice == "y" or choice == "yes"):
             print("Abort")
             exit()
-        print(
-            ANSI.bold("Uploading... This might take a while if files are large")
-        )
+        print(ANSI.bold("Uploading... This might take a while if files are large"))
         for filepath, filename in files:
-            access_url = self._api.presign_and_upload(
-                token=token, filename=filename, filepath=filepath
-            )
+            access_url = self._api.presign_and_upload(token=token, filename=filename, filepath=filepath)
             print("Your file now lives at:")
             print(access_url)
diff --git a/transformers/configuration_albert.py b/transformers/configuration_albert.py
index 6a1ef78dd5..dc2b74a29c 100644
--- a/transformers/configuration_albert.py
+++ b/transformers/configuration_albert.py
@@ -18,16 +18,17 @@
 from .configuration_utils import PretrainedConfig
 
 ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json",
-    'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json",
-    'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json",
-    'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json",
-    'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json",
-    'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json",
-    'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json",
-    'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json",
+    "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json",
+    "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json",
+    "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json",
+    "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json",
+    "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json",
+    "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json",
+    "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json",
+    "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json",
 }
 
+
 class AlbertConfig(PretrainedConfig):
     """Configuration for `AlbertModel`.
 
@@ -36,22 +37,25 @@ class AlbertConfig(PretrainedConfig):
 
     pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
-    def __init__(self,
-                 vocab_size=30000,
-                 embedding_size=128,
-                 hidden_size=4096,
-                 num_hidden_layers=12,
-                 num_hidden_groups=1,
-                 num_attention_heads=64,
-                 intermediate_size=16384,
-                 inner_group_num=1,
-                 hidden_act="gelu_new",
-                 hidden_dropout_prob=0,
-                 attention_probs_dropout_prob=0,
-                 max_position_embeddings=512,
-                 type_vocab_size=2,
-                 initializer_range=0.02,
-                 layer_norm_eps=1e-12, **kwargs):
+    def __init__(
+        self,
+        vocab_size=30000,
+        embedding_size=128,
+        hidden_size=4096,
+        num_hidden_layers=12,
+        num_hidden_groups=1,
+        num_attention_heads=64,
+        intermediate_size=16384,
+        inner_group_num=1,
+        hidden_act="gelu_new",
+        hidden_dropout_prob=0,
+        attention_probs_dropout_prob=0,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        **kwargs
+    ):
         """Constructs AlbertConfig.
 
         Args:
diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py
index 281256389e..e4311fc285 100644
--- a/transformers/configuration_auto.py
+++ b/transformers/configuration_auto.py
@@ -35,7 +35,8 @@ from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_
 logger = logging.getLogger(__name__)
 
 
-ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value)
+ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
+    (key, value)
     for pretrained_map in [
         BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
         OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -50,8 +51,9 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value)
         CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
         T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
         XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        ]
-    for key, value, in pretrained_map.items())
+    ]
+    for key, value, in pretrained_map.items()
+)
 
 
 class AutoConfig(object):
@@ -79,37 +81,42 @@ class AutoConfig(object):
             - contains `ctrl` : CTRLConfig (CTRL model)
         This class cannot be instantiated using `__init__()` (throw an error).
     """
+
     def __init__(self):
-        raise EnvironmentError("AutoConfig is designed to be instantiated "
-            "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.")
+        raise EnvironmentError(
+            "AutoConfig is designed to be instantiated "
+            "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method."
+        )
 
     @classmethod
     def for_model(cls, model_type, *args, **kwargs):
-        if 'distilbert' in model_type:
+        if "distilbert" in model_type:
             return DistilBertConfig(*args, **kwargs)
-        elif 'roberta' in model_type:
+        elif "roberta" in model_type:
             return RobertaConfig(*args, **kwargs)
-        elif 'bert' in model_type:
+        elif "bert" in model_type:
             return BertConfig(*args, **kwargs)
-        elif 'openai-gpt' in model_type:
+        elif "openai-gpt" in model_type:
             return OpenAIGPTConfig(*args, **kwargs)
-        elif 'gpt2' in model_type:
+        elif "gpt2" in model_type:
             return GPT2Config(*args, **kwargs)
-        elif 'transfo-xl' in model_type:
+        elif "transfo-xl" in model_type:
             return TransfoXLConfig(*args, **kwargs)
-        elif 'xlnet' in model_type:
+        elif "xlnet" in model_type:
             return XLNetConfig(*args, **kwargs)
-        elif 'xlm' in model_type:
+        elif "xlm" in model_type:
             return XLMConfig(*args, **kwargs)
-        elif 'ctrl' in model_type:
+        elif "ctrl" in model_type:
             return CTRLConfig(*args, **kwargs)
-        elif 'albert' in model_type:
+        elif "albert" in model_type:
             return AlbertConfig(*args, **kwargs)
-        elif 'camembert' in model_type:
+        elif "camembert" in model_type:
             return CamembertConfig(*args, **kwargs)
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta', 'ctrl', 'camembert', 'albert'".format(model_type))
+        raise ValueError(
+            "Unrecognized model identifier in {}. Should contains one of "
+            "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+            "'xlm', 'roberta', 'ctrl', 'camembert', 'albert'".format(model_type)
+        )
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
@@ -176,32 +183,36 @@ class AutoConfig(object):
             assert unused_kwargs == {'foo': False}
 
         """
-        if 't5' in pretrained_model_name_or_path:
+        if "t5" in pretrained_model_name_or_path:
             return T5Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'distilbert' in pretrained_model_name_or_path:
+        elif "distilbert" in pretrained_model_name_or_path:
             return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'albert' in pretrained_model_name_or_path:
+        elif "albert" in pretrained_model_name_or_path:
             return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'camembert' in pretrained_model_name_or_path:
+        elif "camembert" in pretrained_model_name_or_path:
             return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'xlm-roberta' in pretrained_model_name_or_path:
+        elif "xlm-roberta" in pretrained_model_name_or_path:
             return XLMRobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
+        elif "roberta" in pretrained_model_name_or_path:
             return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'bert' in pretrained_model_name_or_path:
+        elif "bert" in pretrained_model_name_or_path:
             return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'openai-gpt' in pretrained_model_name_or_path:
+        elif "openai-gpt" in pretrained_model_name_or_path:
             return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'gpt2' in pretrained_model_name_or_path:
+        elif "gpt2" in pretrained_model_name_or_path:
             return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'transfo-xl' in pretrained_model_name_or_path:
+        elif "transfo-xl" in pretrained_model_name_or_path:
             return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'xlnet' in pretrained_model_name_or_path:
+        elif "xlnet" in pretrained_model_name_or_path:
             return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'xlm' in pretrained_model_name_or_path:
+        elif "xlm" in pretrained_model_name_or_path:
             return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'ctrl' in pretrained_model_name_or_path:
+        elif "ctrl" in pretrained_model_name_or_path:
             return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
+        raise ValueError(
+            "Unrecognized model identifier in {}. Should contains one of "
+            "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+            "'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(
+                pretrained_model_name_or_path
+            )
+        )
diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py
index 7b495013ff..7c5ee434a4 100644
--- a/transformers/configuration_bert.py
+++ b/transformers/configuration_bert.py
@@ -27,27 +27,27 @@ from .configuration_utils import PretrainedConfig
 logger = logging.getLogger(__name__)
 
 BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
-    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
-    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
-    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
-    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
-    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
-    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
-    'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
-    'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
-    'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
-    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
-    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
-    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
-    'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
-    'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
-    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
-    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
-    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
-    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json",
-    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json",
-    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json",
+    "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
+    "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
+    "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
+    "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
+    "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
+    "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
+    "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
+    "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
+    "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
+    "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
+    "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
+    "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
+    "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
+    "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
+    "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
+    "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
+    "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
+    "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
+    "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json",
+    "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json",
+    "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json",
 }
 
 
@@ -82,20 +82,22 @@ class BertConfig(PretrainedConfig):
     """
     pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
-    def __init__(self,
-                 vocab_size=30522,
-                 hidden_size=768,
-                 num_hidden_layers=12,
-                 num_attention_heads=12,
-                 intermediate_size=3072,
-                 hidden_act="gelu",
-                 hidden_dropout_prob=0.1,
-                 attention_probs_dropout_prob=0.1,
-                 max_position_embeddings=512,
-                 type_vocab_size=2,
-                 initializer_range=0.02,
-                 layer_norm_eps=1e-12,
-                 **kwargs):
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        **kwargs
+    ):
         super(BertConfig, self).__init__(**kwargs)
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
diff --git a/transformers/configuration_camembert.py b/transformers/configuration_camembert.py
index 3ff64454e5..9aa641aa5f 100644
--- a/transformers/configuration_camembert.py
+++ b/transformers/configuration_camembert.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """ CamemBERT configuration """
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 
@@ -25,7 +24,7 @@ from .configuration_roberta import RobertaConfig
 logger = logging.getLogger(__name__)
 
 CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json",
+    "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json",
 }
 
 
diff --git a/transformers/configuration_ctrl.py b/transformers/configuration_ctrl.py
index f9b9e409e1..2726727d48 100644
--- a/transformers/configuration_ctrl.py
+++ b/transformers/configuration_ctrl.py
@@ -27,6 +27,7 @@ logger = logging.getLogger(__name__)
 
 CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/ctrl-config.json"}
 
+
 class CTRLConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `CTRLModel`.
 
@@ -48,6 +49,7 @@ class CTRLConfig(PretrainedConfig):
         initializer_range: The sttdev of the truncated_normal_initializer for
             initializing all weight matrices.
     """
+
     pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(
@@ -64,7 +66,7 @@ class CTRLConfig(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-6,
         initializer_range=0.02,
-        summary_type='cls_index',
+        summary_type="cls_index",
         summary_use_proj=True,
         summary_activation=None,
         summary_proj_to_labels=True,
diff --git a/transformers/configuration_distilbert.py b/transformers/configuration_distilbert.py
index d9f7cc6348..120cbfb9f2 100644
--- a/transformers/configuration_distilbert.py
+++ b/transformers/configuration_distilbert.py
@@ -13,8 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ DistilBERT model configuration """
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import sys
 import json
@@ -26,32 +25,34 @@ from .configuration_utils import PretrainedConfig
 logger = logging.getLogger(__name__)
 
 DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
-    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json",
-    'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json",
-    'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json",
+    "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
+    "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json",
+    "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json",
+    "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json",
 }
 
 
 class DistilBertConfig(PretrainedConfig):
     pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
-    def __init__(self,
-                 vocab_size=30522,
-                 max_position_embeddings=512,
-                 sinusoidal_pos_embds=False,
-                 n_layers=6,
-                 n_heads=12,
-                 dim=768,
-                 hidden_dim=4*768,
-                 dropout=0.1,
-                 attention_dropout=0.1,
-                 activation='gelu',
-                 initializer_range=0.02,
-                 tie_weights_=True,
-                 qa_dropout=0.1,
-                 seq_classif_dropout=0.2,
-                 **kwargs):
+    def __init__(
+        self,
+        vocab_size=30522,
+        max_position_embeddings=512,
+        sinusoidal_pos_embds=False,
+        n_layers=6,
+        n_heads=12,
+        dim=768,
+        hidden_dim=4 * 768,
+        dropout=0.1,
+        attention_dropout=0.1,
+        activation="gelu",
+        initializer_range=0.02,
+        tie_weights_=True,
+        qa_dropout=0.1,
+        seq_classif_dropout=0.2,
+        **kwargs
+    ):
         super(DistilBertConfig, self).__init__(**kwargs)
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
diff --git a/transformers/configuration_gpt2.py b/transformers/configuration_gpt2.py
index 4c200c0760..adc8842edc 100644
--- a/transformers/configuration_gpt2.py
+++ b/transformers/configuration_gpt2.py
@@ -26,11 +26,14 @@ from .configuration_utils import PretrainedConfig
 
 logger = logging.getLogger(__name__)
 
-GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
-                                      "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
-                                      "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
-                                      "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json",
-                                      "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",}
+GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
+    "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
+    "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
+    "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json",
+    "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",
+}
+
 
 class GPT2Config(PretrainedConfig):
     """Configuration class to store the configuration of a `GPT2Model`.
@@ -52,6 +55,7 @@ class GPT2Config(PretrainedConfig):
         initializer_range: The sttdev of the truncated_normal_initializer for
             initializing all weight matrices.
     """
+
     pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(
@@ -67,7 +71,7 @@ class GPT2Config(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
-        summary_type='cls_index',
+        summary_type="cls_index",
         summary_use_proj=True,
         summary_activation=None,
         summary_proj_to_labels=True,
diff --git a/transformers/configuration_mmbt.py b/transformers/configuration_mmbt.py
index 60176c9872..5dad2babef 100644
--- a/transformers/configuration_mmbt.py
+++ b/transformers/configuration_mmbt.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """ MMBT configuration """
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 
@@ -31,6 +30,7 @@ class MMBTConfig(object):
         num_labels: Size of final Linear layer for classification.
         modal_hidden_size: Embedding dimension of the non-text modality encoder.
     """
+
     def __init__(self, config, num_labels=None, modal_hidden_size=2048):
         self.__dict__ = config.__dict__
         self.modal_hidden_size = modal_hidden_size
diff --git a/transformers/configuration_openai.py b/transformers/configuration_openai.py
index 7776a0bb9f..53929aab5f 100644
--- a/transformers/configuration_openai.py
+++ b/transformers/configuration_openai.py
@@ -30,6 +30,7 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"
 }
 
+
 class OpenAIGPTConfig(PretrainedConfig):
     """
     Configuration class to store the configuration of a `OpenAIGPTModel`.
@@ -54,6 +55,7 @@ class OpenAIGPTConfig(PretrainedConfig):
             initializing all weight matrices.
         predict_special_tokens: should we predict special tokens (when the model has a LM head)
     """
+
     pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(
@@ -71,7 +73,7 @@ class OpenAIGPTConfig(PretrainedConfig):
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
         predict_special_tokens=True,
-        summary_type='cls_index',
+        summary_type="cls_index",
         summary_use_proj=True,
         summary_activation=None,
         summary_proj_to_labels=True,
diff --git a/transformers/configuration_roberta.py b/transformers/configuration_roberta.py
index 842edac56e..3b8ddd1c46 100644
--- a/transformers/configuration_roberta.py
+++ b/transformers/configuration_roberta.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """ RoBERTa configuration """
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 
@@ -25,12 +24,12 @@ from .configuration_bert import BertConfig
 logger = logging.getLogger(__name__)
 
 ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
-    'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
-    'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
-    'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json",
-    'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json",
-    'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json",
+    "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
+    "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
+    "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
+    "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json",
+    "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json",
+    "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json",
 }
 
 
diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py
index 377a0919d9..4584015e27 100644
--- a/transformers/configuration_t5.py
+++ b/transformers/configuration_t5.py
@@ -27,11 +27,11 @@ from .configuration_utils import PretrainedConfig
 logger = logging.getLogger(__name__)
 
 T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
-    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json",
-    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json",
-    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json",
-    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json",
+    "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
+    "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json",
+    "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json",
+    "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json",
+    "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json",
 }
 
 
@@ -65,19 +65,21 @@ class T5Config(PretrainedConfig):
     """
     pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP
 
-    def __init__(self,
-                 vocab_size=32128,
-                 n_positions=512,
-                 d_model=512,
-                 d_kv=64,
-                 d_ff=2048,
-                 num_layers=6,
-                 num_heads=8,
-                 relative_attention_num_buckets=32,
-                 dropout_rate=0.1,
-                 layer_norm_epsilon=1e-6,
-                 initializer_factor=1.0,
-                 **kwargs):
+    def __init__(
+        self,
+        vocab_size=32128,
+        n_positions=512,
+        d_model=512,
+        d_kv=64,
+        d_ff=2048,
+        num_layers=6,
+        num_heads=8,
+        relative_attention_num_buckets=32,
+        dropout_rate=0.1,
+        layer_norm_epsilon=1e-6,
+        initializer_factor=1.0,
+        **kwargs
+    ):
         super(T5Config, self).__init__(**kwargs)
         self.vocab_size = vocab_size
         self.n_positions = n_positions
diff --git a/transformers/configuration_transfo_xl.py b/transformers/configuration_transfo_xl.py
index 52f0f45a50..a2a7c5c02d 100644
--- a/transformers/configuration_transfo_xl.py
+++ b/transformers/configuration_transfo_xl.py
@@ -27,9 +27,10 @@ from .configuration_utils import PretrainedConfig
 logger = logging.getLogger(__name__)
 
 TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
+    "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
 }
 
+
 class TransfoXLConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `TransfoXLModel`.
 
@@ -65,38 +66,41 @@ class TransfoXLConfig(PretrainedConfig):
             proj_init_std: parameters initialized by N(0, init_std)
             init_std: parameters initialized by N(0, init_std)
     """
+
     pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
 
-    def __init__(self,
-                 vocab_size=267735,
-                 cutoffs=[20000, 40000, 200000],
-                 d_model=1024,
-                 d_embed=1024,
-                 n_head=16,
-                 d_head=64,
-                 d_inner=4096,
-                 div_val=4,
-                 pre_lnorm=False,
-                 n_layer=18,
-                 tgt_len=128,
-                 ext_len=0,
-                 mem_len=1600,
-                 clamp_len=1000,
-                 same_length=True,
-                 proj_share_all_but_first=True,
-                 attn_type=0,
-                 sample_softmax=-1,
-                 adaptive=True,
-                 tie_weight=True,
-                 dropout=0.1,
-                 dropatt=0.0,
-                 untie_r=True,
-                 init="normal",
-                 init_range=0.01,
-                 proj_init_std=0.01,
-                 init_std=0.02,
-                 layer_norm_epsilon=1e-5,
-                 **kwargs):
+    def __init__(
+        self,
+        vocab_size=267735,
+        cutoffs=[20000, 40000, 200000],
+        d_model=1024,
+        d_embed=1024,
+        n_head=16,
+        d_head=64,
+        d_inner=4096,
+        div_val=4,
+        pre_lnorm=False,
+        n_layer=18,
+        tgt_len=128,
+        ext_len=0,
+        mem_len=1600,
+        clamp_len=1000,
+        same_length=True,
+        proj_share_all_but_first=True,
+        attn_type=0,
+        sample_softmax=-1,
+        adaptive=True,
+        tie_weight=True,
+        dropout=0.1,
+        dropatt=0.0,
+        untie_r=True,
+        init="normal",
+        init_range=0.01,
+        proj_init_std=0.01,
+        init_std=0.02,
+        layer_norm_epsilon=1e-5,
+        **kwargs
+    ):
         """Constructs TransfoXLConfig.
         """
         super(TransfoXLConfig, self).__init__(**kwargs)
diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py
index d2d6ee5d80..f29899175c 100644
--- a/transformers/configuration_utils.py
+++ b/transformers/configuration_utils.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """ Configuration base class and utilities."""
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import copy
 import json
@@ -28,6 +27,7 @@ from .file_utils import CONFIG_NAME, cached_path, is_remote_url, hf_bucket_url
 
 logger = logging.getLogger(__name__)
 
+
 class PretrainedConfig(object):
     r""" Base class for all configuration classes.
         Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
@@ -50,36 +50,36 @@ class PretrainedConfig(object):
 
     def __init__(self, **kwargs):
         # Attributes with defaults
-        self.output_attentions = kwargs.pop('output_attentions', False)
-        self.output_hidden_states = kwargs.pop('output_hidden_states', False)
-        self.output_past = kwargs.pop('output_past', True)  # Not used by all models
-        self.torchscript = kwargs.pop('torchscript', False)  # Only used by PyTorch models
-        self.use_bfloat16 = kwargs.pop('use_bfloat16', False)
-        self.pruned_heads = kwargs.pop('pruned_heads', {})
+        self.output_attentions = kwargs.pop("output_attentions", False)
+        self.output_hidden_states = kwargs.pop("output_hidden_states", False)
+        self.output_past = kwargs.pop("output_past", True)  # Not used by all models
+        self.torchscript = kwargs.pop("torchscript", False)  # Only used by PyTorch models
+        self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
+        self.pruned_heads = kwargs.pop("pruned_heads", {})
 
         # Is decoder is used in encoder-decoder models to differentiate encoder from decoder
-        self.is_decoder = kwargs.pop('is_decoder', False)
+        self.is_decoder = kwargs.pop("is_decoder", False)
 
         # Parameters for sequence generation
-        self.max_length = kwargs.pop('max_length', 20)
-        self.do_sample = kwargs.pop('do_sample', False)
-        self.num_beams = kwargs.pop('num_beams', 1)
-        self.temperature = kwargs.pop('temperature', 1.0)
-        self.top_k = kwargs.pop('top_k', 50)
-        self.top_p = kwargs.pop('top_p', 1.0)
-        self.repetition_penalty = kwargs.pop('repetition_penalty', 1.0)
-        self.bos_token_id = kwargs.pop('bos_token_id', 0)
-        self.pad_token_id = kwargs.pop('pad_token_id', 0)
-        self.eos_token_ids = kwargs.pop('eos_token_ids', 0)
-        self.length_penalty = kwargs.pop('length_penalty', 1.)
-        self.num_return_sequences = kwargs.pop('num_return_sequences', 1)
+        self.max_length = kwargs.pop("max_length", 20)
+        self.do_sample = kwargs.pop("do_sample", False)
+        self.num_beams = kwargs.pop("num_beams", 1)
+        self.temperature = kwargs.pop("temperature", 1.0)
+        self.top_k = kwargs.pop("top_k", 50)
+        self.top_p = kwargs.pop("top_p", 1.0)
+        self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0)
+        self.bos_token_id = kwargs.pop("bos_token_id", 0)
+        self.pad_token_id = kwargs.pop("pad_token_id", 0)
+        self.eos_token_ids = kwargs.pop("eos_token_ids", 0)
+        self.length_penalty = kwargs.pop("length_penalty", 1.0)
+        self.num_return_sequences = kwargs.pop("num_return_sequences", 1)
 
         # Fine-tuning task arguments
-        self.finetuning_task = kwargs.pop('finetuning_task', None)
-        self.num_labels = kwargs.pop('num_labels', 2)
-        self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)})
+        self.finetuning_task = kwargs.pop("finetuning_task", None)
+        self.num_labels = kwargs.pop("num_labels", 2)
+        self.id2label = kwargs.pop("id2label", {i: "LABEL_{}".format(i) for i in range(self.num_labels)})
         self.id2label = dict((int(key), value) for key, value in self.id2label.items())
-        self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys())))
+        self.label2id = kwargs.pop("label2id", dict(zip(self.id2label.values(), self.id2label.keys())))
         self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
 
         # Additional attributes without default values
@@ -94,7 +94,9 @@ class PretrainedConfig(object):
         """ Save a configuration object to the directory `save_directory`, so that it
             can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
         """
-        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
+        assert os.path.isdir(
+            save_directory
+        ), "Saving path should be a directory where the model and configuration can be saved"
 
         # If we save using the predefined names, we can load using `from_pretrained`
         output_config_file = os.path.join(save_directory, CONFIG_NAME)
@@ -153,11 +155,11 @@ class PretrainedConfig(object):
             assert unused_kwargs == {'foo': False}
 
         """
-        cache_dir = kwargs.pop('cache_dir', None)
-        force_download = kwargs.pop('force_download', False)
-        resume_download = kwargs.pop('resume_download', False)
-        proxies = kwargs.pop('proxies', None)
-        return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        resume_download = kwargs.pop("resume_download", False)
+        proxies = kwargs.pop("proxies", None)
+        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
 
         if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
             config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
@@ -170,37 +172,48 @@ class PretrainedConfig(object):
 
         try:
             # Load from URL or cache if already cached
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
-                                               proxies=proxies, resume_download=resume_download)
+            resolved_config_file = cached_path(
+                config_file,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                resume_download=resume_download,
+            )
             # Load config
             config = cls.from_json_file(resolved_config_file)
 
         except EnvironmentError:
             if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
                 msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
-                        config_file)
+                    config_file
+                )
             else:
-                msg = "Model name '{}' was not found in model name list ({}). " \
-                      "We assumed '{}' was a path or url to a configuration file named {} or " \
-                      "a directory containing such a file but couldn't find any such file at this path or url.".format(
+                msg = (
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url to a configuration file named {} or "
+                    "a directory containing such a file but couldn't find any such file at this path or url.".format(
                         pretrained_model_name_or_path,
-                        ', '.join(cls.pretrained_config_archive_map.keys()),
-                        config_file, CONFIG_NAME)
+                        ", ".join(cls.pretrained_config_archive_map.keys()),
+                        config_file,
+                        CONFIG_NAME,
+                    )
+                )
             raise EnvironmentError(msg)
 
         except json.JSONDecodeError:
-            msg = "Couldn't reach server at '{}' to download configuration file or " \
-                  "configuration file is not a valid JSON file. " \
-                  "Please check network or file content here: {}.".format(config_file, resolved_config_file)
+            msg = (
+                "Couldn't reach server at '{}' to download configuration file or "
+                "configuration file is not a valid JSON file. "
+                "Please check network or file content here: {}.".format(config_file, resolved_config_file)
+            )
             raise EnvironmentError(msg)
 
         if resolved_config_file == config_file:
             logger.info("loading configuration file {}".format(config_file))
         else:
-            logger.info("loading configuration file {} from cache at {}".format(
-                config_file, resolved_config_file))
+            logger.info("loading configuration file {} from cache at {}".format(config_file, resolved_config_file))
 
-        if hasattr(config, 'pruned_heads'):
+        if hasattr(config, "pruned_heads"):
             config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
 
         # Update config with kwargs if needed
@@ -226,7 +239,7 @@ class PretrainedConfig(object):
     @classmethod
     def from_json_file(cls, json_file):
         """Constructs a `Config` from a json file of parameters."""
-        with open(json_file, "r", encoding='utf-8') as reader:
+        with open(json_file, "r", encoding="utf-8") as reader:
             text = reader.read()
         dict_obj = json.loads(text)
         return cls(**dict_obj)
@@ -248,5 +261,5 @@ class PretrainedConfig(object):
 
     def to_json_file(self, json_file_path):
         """ Save this instance to a json file."""
-        with open(json_file_path, "w", encoding='utf-8') as writer:
+        with open(json_file_path, "w", encoding="utf-8") as writer:
             writer.write(self.to_json_string())
diff --git a/transformers/configuration_xlm.py b/transformers/configuration_xlm.py
index 727f319778..a98024e9e6 100644
--- a/transformers/configuration_xlm.py
+++ b/transformers/configuration_xlm.py
@@ -25,16 +25,16 @@ from .configuration_utils import PretrainedConfig
 logger = logging.getLogger(__name__)
 
 XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
-    'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json",
-    'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json",
-    'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json",
-    'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json",
-    'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json",
-    'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json",
-    'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json",
-    'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json",
-    'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json",
+    "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
+    "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json",
+    "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json",
+    "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json",
+    "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json",
+    "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json",
+    "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json",
+    "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json",
+    "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json",
+    "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json",
 }
 
 
@@ -78,41 +78,44 @@ class XLMConfig(PretrainedConfig):
             -1 means no clamping.
         same_length: bool, whether to use the same attention length for each token.
     """
+
     pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 
-    def __init__(self,
-                 vocab_size=30145,
-                 emb_dim=2048,
-                 n_layers=12,
-                 n_heads=16,
-                 dropout=0.1,
-                 attention_dropout=0.1,
-                 gelu_activation=True,
-                 sinusoidal_embeddings=False,
-                 causal=False,
-                 asm=False,
-                 n_langs=1,
-                 use_lang_emb=True,
-                 max_position_embeddings=512,
-                 embed_init_std=2048 ** -0.5,
-                 layer_norm_eps=1e-12,
-                 init_std=0.02,
-                 bos_index=0,
-                 eos_index=1,
-                 pad_index=2,
-                 unk_index=3,
-                 mask_index=5,
-                 is_encoder=True,
-                 summary_type='first',
-                 summary_use_proj=True,
-                 summary_activation=None,
-                 summary_proj_to_labels=True,
-                 summary_first_dropout=0.1,
-                 start_n_top=5,
-                 end_n_top=5,
-                 mask_token_id=0,
-                 lang_id=0,
-                 **kwargs):
+    def __init__(
+        self,
+        vocab_size=30145,
+        emb_dim=2048,
+        n_layers=12,
+        n_heads=16,
+        dropout=0.1,
+        attention_dropout=0.1,
+        gelu_activation=True,
+        sinusoidal_embeddings=False,
+        causal=False,
+        asm=False,
+        n_langs=1,
+        use_lang_emb=True,
+        max_position_embeddings=512,
+        embed_init_std=2048 ** -0.5,
+        layer_norm_eps=1e-12,
+        init_std=0.02,
+        bos_index=0,
+        eos_index=1,
+        pad_index=2,
+        unk_index=3,
+        mask_index=5,
+        is_encoder=True,
+        summary_type="first",
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        start_n_top=5,
+        end_n_top=5,
+        mask_token_id=0,
+        lang_id=0,
+        **kwargs
+    ):
         """Constructs XLMConfig.
         """
         super(XLMConfig, self).__init__(**kwargs)
diff --git a/transformers/configuration_xlm_roberta.py b/transformers/configuration_xlm_roberta.py
index 5b6955f4f8..fcf5c571d8 100644
--- a/transformers/configuration_xlm_roberta.py
+++ b/transformers/configuration_xlm_roberta.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """ XLM-RoBERTa configuration """
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 
@@ -25,12 +24,12 @@ from .configuration_roberta import RobertaConfig
 logger = logging.getLogger(__name__)
 
 XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json",
-    'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json",
-    'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json",
-    'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json",
-    'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json",
-    'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json",
+    "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json",
+    "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json",
+    "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json",
+    "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json",
+    "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json",
+    "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json",
 }
 
 
diff --git a/transformers/configuration_xlnet.py b/transformers/configuration_xlnet.py
index 017c57cfd5..8768aeac9b 100644
--- a/transformers/configuration_xlnet.py
+++ b/transformers/configuration_xlnet.py
@@ -26,8 +26,8 @@ from .configuration_utils import PretrainedConfig
 logger = logging.getLogger(__name__)
 
 XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json",
-    'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json",
+    "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json",
+    "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json",
 }
 
 
@@ -69,32 +69,35 @@ class XLNetConfig(PretrainedConfig):
         same_length: bool, whether to use the same attention length for each token.
         finetuning_task: name of the glue task on which the model was fine-tuned if any
     """
+
     pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
 
-    def __init__(self,
-                 vocab_size=32000,
-                 d_model=1024,
-                 n_layer=24,
-                 n_head=16,
-                 d_inner=4096,
-                 ff_activation="gelu",
-                 untie_r=True,
-                 attn_type="bi",
-                 initializer_range=0.02,
-                 layer_norm_eps=1e-12,
-                 dropout=0.1,
-                 mem_len=None,
-                 reuse_len=None,
-                 bi_data=False,
-                 clamp_len=-1,
-                 same_length=False,
-                 summary_type='last',
-                 summary_use_proj=True,
-                 summary_activation='tanh',
-                 summary_last_dropout=0.1,
-                 start_n_top=5,
-                 end_n_top=5,
-                 **kwargs):
+    def __init__(
+        self,
+        vocab_size=32000,
+        d_model=1024,
+        n_layer=24,
+        n_head=16,
+        d_inner=4096,
+        ff_activation="gelu",
+        untie_r=True,
+        attn_type="bi",
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        dropout=0.1,
+        mem_len=None,
+        reuse_len=None,
+        bi_data=False,
+        clamp_len=-1,
+        same_length=False,
+        summary_type="last",
+        summary_use_proj=True,
+        summary_activation="tanh",
+        summary_last_dropout=0.1,
+        start_n_top=5,
+        end_n_top=5,
+        **kwargs
+    ):
         """Constructs XLNetConfig.
         """
         super(XLNetConfig, self).__init__(**kwargs)
diff --git a/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py b/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py
index b6476b4fb6..733f6fc5ca 100644
--- a/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py
@@ -24,6 +24,7 @@ import torch
 from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert
 
 import logging
+
 logging.basicConfig(level=logging.INFO)
 
 
@@ -44,24 +45,19 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pyt
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
-    parser.add_argument("--tf_checkpoint_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the TensorFlow checkpoint path.")
-    parser.add_argument("--albert_config_file",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "The config json file corresponding to the pre-trained ALBERT model. \n"
-                            "This specifies the model architecture.")
-    parser.add_argument("--pytorch_dump_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the output PyTorch model.")
+    parser.add_argument(
+        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
+    )
+    parser.add_argument(
+        "--albert_config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The config json file corresponding to the pre-trained ALBERT model. \n"
+        "This specifies the model architecture.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
     args = parser.parse_args()
-    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
-                                     args.albert_config_file,
-                                     args.pytorch_dump_path)
- 
\ No newline at end of file
+    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path)
diff --git a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
index 75808811ef..9393068b17 100755
--- a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
@@ -24,8 +24,10 @@ import torch
 from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
 
 import logging
+
 logging.basicConfig(level=logging.INFO)
 
+
 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
     # Initialise PyTorch model
     config = BertConfig.from_json_file(bert_config_file)
@@ -43,23 +45,19 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
-    parser.add_argument("--tf_checkpoint_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the TensorFlow checkpoint path.")
-    parser.add_argument("--bert_config_file",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "The config json file corresponding to the pre-trained BERT model. \n"
-                            "This specifies the model architecture.")
-    parser.add_argument("--pytorch_dump_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the output PyTorch model.")
+    parser.add_argument(
+        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
+    )
+    parser.add_argument(
+        "--bert_config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The config json file corresponding to the pre-trained BERT model. \n"
+        "This specifies the model architecture.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
     args = parser.parse_args()
-    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
-                                     args.bert_config_file,
-                                     args.pytorch_dump_path)
+    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path)
diff --git a/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
index 35866caac4..304c634502 100644
--- a/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
+++ b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
@@ -23,7 +23,7 @@ import tensorflow as tf
 from transformers import BertModel
 
 
-def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
+def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str):
 
     """
     :param model:BertModel Pytorch model instance to be converted
@@ -41,22 +41,17 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s
         N BertForQuestionAnswering
     """
 
-    tensors_to_transpose = (
-        "dense.weight",
-        "attention.self.query",
-        "attention.self.key",
-        "attention.self.value"
-    )
+    tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value")
 
     var_map = (
-        ('layer.', 'layer_'),
-        ('word_embeddings.weight', 'word_embeddings'),
-        ('position_embeddings.weight', 'position_embeddings'),
-        ('token_type_embeddings.weight', 'token_type_embeddings'),
-        ('.', '/'),
-        ('LayerNorm/weight', 'LayerNorm/gamma'),
-        ('LayerNorm/bias', 'LayerNorm/beta'),
-        ('weight', 'kernel')
+        ("layer.", "layer_"),
+        ("word_embeddings.weight", "word_embeddings"),
+        ("position_embeddings.weight", "position_embeddings"),
+        ("token_type_embeddings.weight", "token_type_embeddings"),
+        (".", "/"),
+        ("LayerNorm/weight", "LayerNorm/gamma"),
+        ("LayerNorm/bias", "LayerNorm/beta"),
+        ("weight", "kernel"),
     )
 
     if not os.path.isdir(ckpt_dir):
@@ -64,12 +59,12 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s
 
     state_dict = model.state_dict()
 
-    def to_tf_var_name(name:str):
+    def to_tf_var_name(name: str):
         for patt, repl in iter(var_map):
             name = name.replace(patt, repl)
-        return 'bert/{}'.format(name)
+        return "bert/{}".format(name)
 
-    def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session):
+    def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session):
         tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
         tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer())
         session.run(tf.variables_initializer([tf_var]))
@@ -94,37 +89,22 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s
 
 def main(raw_args=None):
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model_name",
-                        type=str,
-                        required=True,
-                        help="model name e.g. bert-base-uncased")
-    parser.add_argument("--cache_dir",
-                        type=str,
-                        default=None,
-                        required=False,
-                        help="Directory containing pytorch model")
-    parser.add_argument("--pytorch_model_path",
-                        type=str,
-                        required=True,
-                        help="/path/to/<pytorch-model-name>.bin")
-    parser.add_argument("--tf_cache_dir",
-                        type=str,
-                        required=True,
-                        help="Directory in which to save tensorflow model")
+    parser.add_argument("--model_name", type=str, required=True, help="model name e.g. bert-base-uncased")
+    parser.add_argument(
+        "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model"
+    )
+    parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/<pytorch-model-name>.bin")
+    parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model")
     args = parser.parse_args(raw_args)
-    
+
     model = BertModel.from_pretrained(
         pretrained_model_name_or_path=args.model_name,
         state_dict=torch.load(args.pytorch_model_path),
-        cache_dir=args.cache_dir
-    )
-    
-    convert_pytorch_checkpoint_to_tf(
-        model=model,
-        ckpt_dir=args.tf_cache_dir,
-        model_name=args.model_name
+        cache_dir=args.cache_dir,
     )
 
+    convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name)
+
 
 if __name__ == "__main__":
     main()
diff --git a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
index e2328c08ca..eeafdb81e5 100755
--- a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
@@ -21,12 +21,10 @@ from io import open
 
 import torch
 
-from transformers import (CONFIG_NAME, WEIGHTS_NAME,
-                                                     GPT2Config,
-                                                     GPT2Model,
-                                                     load_tf_weights_in_gpt2)
+from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2
 
 import logging
+
 logging.basicConfig(level=logging.INFO)
 
 
@@ -42,8 +40,8 @@ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, p
     load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
 
     # Save pytorch-model
-    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
-    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
+    pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
+    pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
     print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
     torch.save(model.state_dict(), pytorch_weights_dump_path)
     print("Save configuration file to {}".format(pytorch_config_dump_path))
@@ -54,22 +52,18 @@ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, p
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
-    parser.add_argument("--gpt2_checkpoint_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the TensorFlow checkpoint path.")
-    parser.add_argument("--pytorch_dump_folder_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the output PyTorch model.")
-    parser.add_argument("--gpt2_config_file",
-                        default = "",
-                        type = str,
-                        help = "An optional config json file corresponding to the pre-trained OpenAI model. \n"
-                            "This specifies the model architecture.")
+    parser.add_argument(
+        "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
+    parser.add_argument(
+        "--gpt2_config_file",
+        default="",
+        type=str,
+        help="An optional config json file corresponding to the pre-trained OpenAI model. \n"
+        "This specifies the model architecture.",
+    )
     args = parser.parse_args()
-    convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path,
-                                         args.gpt2_config_file,
-                                         args.pytorch_dump_folder_path)
+    convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path)
diff --git a/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py
index 13ebecf2fd..c87bb9d594 100755
--- a/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py
@@ -21,12 +21,10 @@ from io import open
 
 import torch
 
-from transformers import (CONFIG_NAME, WEIGHTS_NAME,
-                                                     OpenAIGPTConfig,
-                                                     OpenAIGPTModel,
-                                                     load_tf_weights_in_openai_gpt)
+from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt
 
 import logging
+
 logging.basicConfig(level=logging.INFO)
 
 
@@ -42,8 +40,8 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c
     load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path)
 
     # Save pytorch-model
-    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
-    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
+    pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
+    pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
     print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
     torch.save(model.state_dict(), pytorch_weights_dump_path)
     print("Save configuration file to {}".format(pytorch_config_dump_path))
@@ -54,22 +52,24 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
-    parser.add_argument("--openai_checkpoint_folder_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the TensorFlow checkpoint path.")
-    parser.add_argument("--pytorch_dump_folder_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the output PyTorch model.")
-    parser.add_argument("--openai_config_file",
-                        default = "",
-                        type = str,
-                        help = "An optional config json file corresponding to the pre-trained OpenAI model. \n"
-                            "This specifies the model architecture.")
+    parser.add_argument(
+        "--openai_checkpoint_folder_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the TensorFlow checkpoint path.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
+    parser.add_argument(
+        "--openai_config_file",
+        default="",
+        type=str,
+        help="An optional config json file corresponding to the pre-trained OpenAI model. \n"
+        "This specifies the model architecture.",
+    )
     args = parser.parse_args()
-    convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path,
-                                         args.openai_config_file,
-                                         args.pytorch_dump_folder_path)
+    convert_openai_checkpoint_to_pytorch(
+        args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path
+    )
diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py
index 0edac6fb7d..c7ad66e132 100644
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -24,82 +24,270 @@ import tensorflow as tf
 
 from transformers import is_torch_available, cached_path
 
-from transformers import (load_pytorch_checkpoint_in_tf2_model,
-    BertConfig, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  GPT2Config, TFGPT2LMHeadModel, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  XLNetConfig, TFXLNetLMHeadModel, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  XLMConfig, TFXLMWithLMHeadModel, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  TransfoXLConfig, TFTransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, TFDistilBertForSequenceClassification, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP)
+from transformers import (
+    load_pytorch_checkpoint_in_tf2_model,
+    BertConfig,
+    TFBertForPreTraining,
+    TFBertForQuestionAnswering,
+    TFBertForSequenceClassification,
+    BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    GPT2Config,
+    TFGPT2LMHeadModel,
+    GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    XLNetConfig,
+    TFXLNetLMHeadModel,
+    XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    XLMConfig,
+    TFXLMWithLMHeadModel,
+    XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    TransfoXLConfig,
+    TFTransfoXLLMHeadModel,
+    TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    OpenAIGPTConfig,
+    TFOpenAIGPTLMHeadModel,
+    OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    RobertaConfig,
+    TFRobertaForMaskedLM,
+    TFRobertaForSequenceClassification,
+    ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    DistilBertConfig,
+    TFDistilBertForMaskedLM,
+    TFDistilBertForQuestionAnswering,
+    TFDistilBertForSequenceClassification,
+    DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    CTRLConfig,
+    TFCTRLLMHeadModel,
+    CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    AlbertConfig,
+    TFAlbertForMaskedLM,
+    ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    T5Config,
+    TFT5WithLMHeadModel,
+    T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
+)
 
 if is_torch_available():
     import torch
     import numpy as np
-    from transformers import (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from transformers import (
+        BertForPreTraining,
+        BertForQuestionAnswering,
+        BertForSequenceClassification,
+        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        GPT2LMHeadModel,
+        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLNetLMHeadModel,
+        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLMWithLMHeadModel,
+        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TransfoXLLMHeadModel,
+        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        OpenAIGPTLMHeadModel,
+        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        RobertaForMaskedLM,
+        RobertaForSequenceClassification,
+        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+        DistilBertForMaskedLM,
+        DistilBertForQuestionAnswering,
+        DistilBertForSequenceClassification,
+        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        CTRLLMHeadModel,
+        CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        AlbertForMaskedLM,
+        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        T5WithLMHeadModel,
+        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 else:
-    (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
-    XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-    XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
-    TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
-    OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-    DistilBertForMaskedLM, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-    AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = (
-        None, None, None, None,
-        None, None,
-        None, None,
-        None, None,
-        None, None,
-        None, None,
-        None, None, None,
-        None, None, None, None,
-        None, None,
-        None, None,
-        None, None)
+    (
+        BertForPreTraining,
+        BertForQuestionAnswering,
+        BertForSequenceClassification,
+        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        GPT2LMHeadModel,
+        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLNetLMHeadModel,
+        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLMWithLMHeadModel,
+        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TransfoXLLMHeadModel,
+        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        OpenAIGPTLMHeadModel,
+        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        RobertaForMaskedLM,
+        RobertaForSequenceClassification,
+        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+        DistilBertForMaskedLM,
+        DistilBertForSequenceClassification,
+        DistilBertForQuestionAnswering,
+        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        CTRLLMHeadModel,
+        CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        AlbertForMaskedLM,
+        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        T5WithLMHeadModel,
+        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
+    ) = (
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    )
 
 
 import logging
+
 logging.basicConfig(level=logging.INFO)
 
 MODEL_CLASSES = {
-    'bert': (BertConfig, TFBertForPreTraining, BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'bert-large-uncased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'bert-large-cased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'bert-base-cased-finetuned-mrpc': (BertConfig, TFBertForSequenceClassification, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'gpt2': (GPT2Config, TFGPT2LMHeadModel, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'xlnet': (XLNetConfig, TFXLNetLMHeadModel, XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'xlm': (XLMConfig, TFXLMWithLMHeadModel, XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'transfo-xl': (TransfoXLConfig, TFTransfoXLLMHeadModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'openai-gpt': (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'roberta': (RobertaConfig, TFRobertaForMaskedLM, RobertaForMaskedLM, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    "bert": (
+        BertConfig,
+        TFBertForPreTraining,
+        BertForPreTraining,
+        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "bert-large-uncased-whole-word-masking-finetuned-squad": (
+        BertConfig,
+        TFBertForQuestionAnswering,
+        BertForQuestionAnswering,
+        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "bert-large-cased-whole-word-masking-finetuned-squad": (
+        BertConfig,
+        TFBertForQuestionAnswering,
+        BertForQuestionAnswering,
+        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "bert-base-cased-finetuned-mrpc": (
+        BertConfig,
+        TFBertForSequenceClassification,
+        BertForSequenceClassification,
+        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "gpt2": (
+        GPT2Config,
+        TFGPT2LMHeadModel,
+        GPT2LMHeadModel,
+        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+        GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "xlnet": (
+        XLNetConfig,
+        TFXLNetLMHeadModel,
+        XLNetLMHeadModel,
+        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "xlm": (
+        XLMConfig,
+        TFXLMWithLMHeadModel,
+        XLMWithLMHeadModel,
+        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "transfo-xl": (
+        TransfoXLConfig,
+        TFTransfoXLLMHeadModel,
+        TransfoXLLMHeadModel,
+        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "openai-gpt": (
+        OpenAIGPTConfig,
+        TFOpenAIGPTLMHeadModel,
+        OpenAIGPTLMHeadModel,
+        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "roberta": (
+        RobertaConfig,
+        TFRobertaForMaskedLM,
+        RobertaForMaskedLM,
+        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "roberta-large-mnli": (
+        RobertaConfig,
+        TFRobertaForSequenceClassification,
+        RobertaForSequenceClassification,
+        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "distilbert": (
+        DistilBertConfig,
+        TFDistilBertForMaskedLM,
+        DistilBertForMaskedLM,
+        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "distilbert-base-uncased-distilled-squad": (
+        DistilBertConfig,
+        TFDistilBertForQuestionAnswering,
+        DistilBertForQuestionAnswering,
+        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "distilbert-base-uncased-distilled-squad": (
+        DistilBertConfig,
+        TFDistilBertForQuestionAnswering,
+        DistilBertForQuestionAnswering,
+        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "ctrl": (
+        CTRLConfig,
+        TFCTRLLMHeadModel,
+        CTRLLMHeadModel,
+        CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "albert": (
+        AlbertConfig,
+        TFAlbertForMaskedLM,
+        AlbertForMaskedLM,
+        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "t5": (
+        T5Config,
+        TFT5WithLMHeadModel,
+        T5WithLMHeadModel,
+        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
+        T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
 }
 
-def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
+
+def convert_pt_checkpoint_to_tf(
+    model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True
+):
     if model_type not in MODEL_CLASSES:
         raise ValueError("Unrecognized model type, should be one of {}.".format(list(MODEL_CLASSES.keys())))
 
@@ -116,17 +304,19 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
 
     # Load weights from tf checkpoint
     if pytorch_checkpoint_path in aws_model_maps:
-        pytorch_checkpoint_path = cached_path(aws_model_maps[pytorch_checkpoint_path], force_download=not use_cached_models)
+        pytorch_checkpoint_path = cached_path(
+            aws_model_maps[pytorch_checkpoint_path], force_download=not use_cached_models
+        )
     # Load PyTorch checkpoint in tf2 model:
     tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path)
 
     if compare_with_pt_model:
         tfo = tf_model(tf_model.dummy_inputs, training=False)  # build the network
 
-        state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu')
-        pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None,
-                                                  config=config,
-                                                  state_dict=state_dict)
+        state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu")
+        pt_model = pt_model_class.from_pretrained(
+            pretrained_model_name_or_path=None, config=config, state_dict=state_dict
+        )
 
         with torch.no_grad():
             pto = pt_model(**pt_model.dummy_inputs)
@@ -139,11 +329,19 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
 
     # Save pytorch-model
     print("Save TensorFlow model to {}".format(tf_dump_path))
-    tf_model.save_weights(tf_dump_path, save_format='h5')
+    tf_model.save_weights(tf_dump_path, save_format="h5")
 
 
-def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
-                                     compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False):
+def convert_all_pt_checkpoints_to_tf(
+    args_model_type,
+    tf_dump_path,
+    model_shortcut_names_or_path=None,
+    config_shortcut_names_or_path=None,
+    compare_with_pt_model=False,
+    use_cached_models=False,
+    remove_cached_files=False,
+    only_convert_finetuned_models=False,
+):
     assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory"
 
     if args_model_type is None:
@@ -156,7 +354,9 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
         print(" Converting model type {}/{}: {}".format(j, len(model_types), model_type))
         print("=" * 100)
         if model_type not in MODEL_CLASSES:
-            raise ValueError("Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys())))
+            raise ValueError(
+                "Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys()))
+            )
 
         config_class, model_class, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]
 
@@ -166,9 +366,10 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
             config_shortcut_names_or_path = model_shortcut_names_or_path
 
         for i, (model_shortcut_name, config_shortcut_name) in enumerate(
-                zip(model_shortcut_names_or_path, config_shortcut_names_or_path), start=1):
+            zip(model_shortcut_names_or_path, config_shortcut_names_or_path), start=1
+        ):
             print("-" * 100)
-            if '-squad' in model_shortcut_name or '-mrpc' in model_shortcut_name or '-mnli' in model_shortcut_name:
+            if "-squad" in model_shortcut_name or "-mrpc" in model_shortcut_name or "-mnli" in model_shortcut_name:
                 if not only_convert_finetuned_models:
                     print("    Skipping finetuned checkpoint {}".format(model_shortcut_name))
                     continue
@@ -176,7 +377,11 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
             elif only_convert_finetuned_models:
                 print("    Skipping not finetuned checkpoint {}".format(model_shortcut_name))
                 continue
-            print("    Converting checkpoint {}/{}: {} - model_type {}".format(i, len(aws_config_map), model_shortcut_name, model_type))
+            print(
+                "    Converting checkpoint {}/{}: {} - model_type {}".format(
+                    i, len(aws_config_map), model_shortcut_name, model_type
+                )
+            )
             print("-" * 100)
 
             if config_shortcut_name in aws_config_map:
@@ -190,13 +395,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
                 model_file = cached_path(model_shortcut_name, force_download=not use_cached_models)
 
             if os.path.isfile(model_shortcut_name):
-                model_shortcut_name = 'converted_model'
+                model_shortcut_name = "converted_model"
 
-            convert_pt_checkpoint_to_tf(model_type=model_type,
-                                        pytorch_checkpoint_path=model_file,
-                                        config_file=config_file,
-                                        tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
-                                        compare_with_pt_model=compare_with_pt_model)
+            convert_pt_checkpoint_to_tf(
+                model_type=model_type,
+                pytorch_checkpoint_path=model_file,
+                config_file=config_file,
+                tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + "-tf_model.h5"),
+                compare_with_pt_model=compare_with_pt_model,
+            )
             if remove_cached_files:
                 os.remove(config_file)
                 os.remove(model_file)
@@ -205,39 +412,47 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
-    parser.add_argument("--tf_dump_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the output Tensorflow dump file.")
-    parser.add_argument("--model_type",
-                        default = None,
-                        type = str,
-                        help = "Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format(list(MODEL_CLASSES.keys())))
-    parser.add_argument("--pytorch_checkpoint_path",
-                        default = None,
-                        type = str,
-                        help = "Path to the PyTorch checkpoint path or shortcut name to download from AWS. "
-                               "If not given, will download and convert all the checkpoints from AWS.")
-    parser.add_argument("--config_file",
-                        default = None,
-                        type = str,
-                        help = "The config json file corresponding to the pre-trained model. \n"
-                               "This specifies the model architecture. If not given and "
-                               "--pytorch_checkpoint_path is not given or is a shortcut name"
-                               "use the configuration associated to the shortcut name on the AWS")
-    parser.add_argument("--compare_with_pt_model",
-                        action='store_true',
-                        help = "Compare Tensorflow and PyTorch model predictions.")
-    parser.add_argument("--use_cached_models",
-                        action='store_true',
-                        help = "Use cached models if possible instead of updating to latest checkpoint versions.")
-    parser.add_argument("--remove_cached_files",
-                        action='store_true',
-                        help = "Remove pytorch models after conversion (save memory when converting in batches).")
-    parser.add_argument("--only_convert_finetuned_models",
-                        action='store_true',
-                        help = "Only convert finetuned models.")
+    parser.add_argument(
+        "--tf_dump_path", default=None, type=str, required=True, help="Path to the output Tensorflow dump file."
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        help="Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format(
+            list(MODEL_CLASSES.keys())
+        ),
+    )
+    parser.add_argument(
+        "--pytorch_checkpoint_path",
+        default=None,
+        type=str,
+        help="Path to the PyTorch checkpoint path or shortcut name to download from AWS. "
+        "If not given, will download and convert all the checkpoints from AWS.",
+    )
+    parser.add_argument(
+        "--config_file",
+        default=None,
+        type=str,
+        help="The config json file corresponding to the pre-trained model. \n"
+        "This specifies the model architecture. If not given and "
+        "--pytorch_checkpoint_path is not given or is a shortcut name"
+        "use the configuration associated to the shortcut name on the AWS",
+    )
+    parser.add_argument(
+        "--compare_with_pt_model", action="store_true", help="Compare Tensorflow and PyTorch model predictions."
+    )
+    parser.add_argument(
+        "--use_cached_models",
+        action="store_true",
+        help="Use cached models if possible instead of updating to latest checkpoint versions.",
+    )
+    parser.add_argument(
+        "--remove_cached_files",
+        action="store_true",
+        help="Remove pytorch models after conversion (save memory when converting in batches).",
+    )
+    parser.add_argument("--only_convert_finetuned_models", action="store_true", help="Only convert finetuned models.")
     args = parser.parse_args()
 
     # if args.pytorch_checkpoint_path is not None:
@@ -248,11 +463,15 @@ if __name__ == "__main__":
     #                                 compare_with_pt_model=args.compare_with_pt_model,
     #                                 use_cached_models=args.use_cached_models)
     # else:
-    convert_all_pt_checkpoints_to_tf(args.model_type.lower() if args.model_type is not None else None,
-                                        args.tf_dump_path,
-                                        model_shortcut_names_or_path=[args.pytorch_checkpoint_path] if args.pytorch_checkpoint_path is not None else None,
-                                        config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
-                                        compare_with_pt_model=args.compare_with_pt_model,
-                                        use_cached_models=args.use_cached_models,
-                                        remove_cached_files=args.remove_cached_files,
-                                        only_convert_finetuned_models=args.only_convert_finetuned_models)
+    convert_all_pt_checkpoints_to_tf(
+        args.model_type.lower() if args.model_type is not None else None,
+        args.tf_dump_path,
+        model_shortcut_names_or_path=[args.pytorch_checkpoint_path]
+        if args.pytorch_checkpoint_path is not None
+        else None,
+        config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
+        compare_with_pt_model=args.compare_with_pt_model,
+        use_cached_models=args.use_cached_models,
+        remove_cached_files=args.remove_cached_files,
+        only_convert_finetuned_models=args.only_convert_finetuned_models,
+    )
diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
index fedfc1ecb8..3dec4882ff 100644
--- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -30,20 +30,27 @@ if version.parse(fairseq.__version__) < version.parse("0.9.0"):
 
 from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
 from fairseq.modules import TransformerSentenceEncoderLayer
-from transformers.modeling_bert import (BertConfig, BertEncoder,
-                                        BertIntermediate, BertLayer,
-                                        BertModel, BertOutput,
-                                        BertSelfAttention,
-                                        BertSelfOutput)
-from transformers.modeling_roberta import (RobertaEmbeddings,
-                                           RobertaForMaskedLM,
-                                           RobertaForSequenceClassification,
-                                           RobertaModel)
+from transformers.modeling_bert import (
+    BertConfig,
+    BertEncoder,
+    BertIntermediate,
+    BertLayer,
+    BertModel,
+    BertOutput,
+    BertSelfAttention,
+    BertSelfOutput,
+)
+from transformers.modeling_roberta import (
+    RobertaEmbeddings,
+    RobertaForMaskedLM,
+    RobertaForSequenceClassification,
+    RobertaModel,
+)
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-SAMPLE_TEXT = 'Hello world! cécé herlolip'
+SAMPLE_TEXT = "Hello world! cécé herlolip"
 
 
 def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
@@ -61,7 +68,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
         intermediate_size=roberta.args.encoder_ffn_embed_dim,
         max_position_embeddings=514,
         type_vocab_size=1,
-        layer_norm_eps=1e-5, # PyTorch default used in fairseq
+        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
     )
     if classification_head:
         config.num_labels = roberta.args.num_classes
@@ -74,7 +81,9 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     # Embeddings
     model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
     model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
-    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight)  # just zero them out b/c RoBERTa doesn't use them.
+    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
+        model.roberta.embeddings.token_type_embeddings.weight
+    )  # just zero them out b/c RoBERTa doesn't use them.
     model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
     model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
 
@@ -85,11 +94,11 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
 
         ### self attention
         self_attn: BertSelfAttention = layer.attention.self
-        assert(
-            roberta_layer.self_attn.k_proj.weight.data.shape == \
-            roberta_layer.self_attn.q_proj.weight.data.shape == \
-            roberta_layer.self_attn.v_proj.weight.data.shape == \
-            torch.Size((config.hidden_size, config.hidden_size))
+        assert (
+            roberta_layer.self_attn.k_proj.weight.data.shape
+            == roberta_layer.self_attn.q_proj.weight.data.shape
+            == roberta_layer.self_attn.v_proj.weight.data.shape
+            == torch.Size((config.hidden_size, config.hidden_size))
         )
 
         self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
@@ -101,9 +110,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
 
         ### self-attention output
         self_output: BertSelfOutput = layer.attention.output
-        assert(
-            self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
-        )
+        assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
         self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
         self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
         self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
@@ -111,28 +118,24 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
 
         ### intermediate
         intermediate: BertIntermediate = layer.intermediate
-        assert(
-            intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
-        )
+        assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
         intermediate.dense.weight = roberta_layer.fc1.weight
         intermediate.dense.bias = roberta_layer.fc1.bias
 
         ### output
         bert_output: BertOutput = layer.output
-        assert(
-            bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
-        )
+        assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
         bert_output.dense.weight = roberta_layer.fc2.weight
         bert_output.dense.bias = roberta_layer.fc2.bias
         bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
         bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
         #### end of layer
-    
+
     if classification_head:
-        model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight
-        model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias
-        model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight
-        model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias
+        model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight
+        model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias
+        model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight
+        model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias
     else:
         # LM Head
         model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight
@@ -143,21 +146,18 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
         model.lm_head.bias = roberta.model.decoder.lm_head.bias
 
     # Let's check that we get the same results.
-    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1
+    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0)  # batch of size 1
 
     our_output = model(input_ids)[0]
     if classification_head:
-        their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids))
+        their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids))
     else:
         their_output = roberta.model(input_ids)[0]
     print(our_output.shape, their_output.shape)
     max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
-    print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7
+    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
     success = torch.allclose(our_output, their_output, atol=1e-3)
-    print(
-        "Do both models output the same tensors?",
-        "🔥" if success else "💩"
-    )
+    print("Do both models output the same tensors?", "🔥" if success else "💩")
     if not success:
         raise Exception("Something went wRoNg")
 
@@ -169,23 +169,16 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
-    parser.add_argument("--roberta_checkpoint_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path the official PyTorch dump.")
-    parser.add_argument("--pytorch_dump_folder_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the output PyTorch model.")
-    parser.add_argument("--classification_head",
-                        action = "store_true",
-                        help = "Whether to convert a final classification head.")
+    parser.add_argument(
+        "--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
+    parser.add_argument(
+        "--classification_head", action="store_true", help="Whether to convert a final classification head."
+    )
     args = parser.parse_args()
     convert_roberta_checkpoint_to_pytorch(
-        args.roberta_checkpoint_path,
-        args.pytorch_dump_folder_path,
-        args.classification_head
+        args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
     )
-
diff --git a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
index 2b74d2dd93..0b22a5f9c6 100755
--- a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
@@ -24,8 +24,10 @@ import torch
 from transformers import T5Config, T5Model, load_tf_weights_in_t5
 
 import logging
+
 logging.basicConfig(level=logging.INFO)
 
+
 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
     # Initialise PyTorch model
     config = T5Config.from_json_file(config_file)
@@ -43,23 +45,19 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
-    parser.add_argument("--tf_checkpoint_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the TensorFlow checkpoint path.")
-    parser.add_argument("--config_file",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "The config json file corresponding to the pre-trained T5 model. \n"
-                            "This specifies the model architecture.")
-    parser.add_argument("--pytorch_dump_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the output PyTorch model.")
+    parser.add_argument(
+        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
+    )
+    parser.add_argument(
+        "--config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The config json file corresponding to the pre-trained T5 model. \n"
+        "This specifies the model architecture.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
     args = parser.parse_args()
-    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
-                                     args.config_file,
-                                     args.pytorch_dump_path)
+    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
diff --git a/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py b/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
index a5ff4ed22c..f8dd45ae55 100755
--- a/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
@@ -26,9 +26,8 @@ import torch
 import transformers.tokenization_transfo_xl as data_utils
 
 from transformers import CONFIG_NAME, WEIGHTS_NAME
-from transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
-                                                      load_tf_weights_in_transfo_xl)
-from transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
+from transformers import TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl
+from transformers.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -36,32 +35,33 @@ else:
     import pickle
 
 import logging
+
 logging.basicConfig(level=logging.INFO)
 
 # We do this to be able to load python 2 datasets pickles
 # See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918
 data_utils.Vocab = data_utils.TransfoXLTokenizer
 data_utils.Corpus = data_utils.TransfoXLCorpus
-sys.modules['data_utils'] = data_utils
-sys.modules['vocabulary'] = data_utils
+sys.modules["data_utils"] = data_utils
+sys.modules["vocabulary"] = data_utils
 
-def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
-                                             transfo_xl_config_file,
-                                             pytorch_dump_folder_path,
-                                             transfo_xl_dataset_file):
+
+def convert_transfo_xl_checkpoint_to_pytorch(
+    tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file
+):
     if transfo_xl_dataset_file:
         # Convert a pre-processed corpus (see original TensorFlow repo)
         with open(transfo_xl_dataset_file, "rb") as fp:
             corpus = pickle.load(fp, encoding="latin1")
         # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
-        pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['pretrained_vocab_file']
+        pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"]
         print("Save vocabulary to {}".format(pytorch_vocab_dump_path))
         corpus_vocab_dict = corpus.vocab.__dict__
         torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)
 
         corpus_dict_no_vocab = corpus.__dict__
-        corpus_dict_no_vocab.pop('vocab', None)
-        pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME
+        corpus_dict_no_vocab.pop("vocab", None)
+        pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME
         print("Save dataset to {}".format(pytorch_dataset_dump_path))
         torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)
 
@@ -92,26 +92,36 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--pytorch_dump_folder_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the folder to store the PyTorch model or dataset/vocab.")
-    parser.add_argument("--tf_checkpoint_path",
-                        default = "",
-                        type = str,
-                        help = "An optional path to a TensorFlow checkpoint path to be converted.")
-    parser.add_argument("--transfo_xl_config_file",
-                        default = "",
-                        type = str,
-                        help = "An optional config json file corresponding to the pre-trained BERT model. \n"
-                            "This specifies the model architecture.")
-    parser.add_argument("--transfo_xl_dataset_file",
-                        default = "",
-                        type = str,
-                        help = "An optional dataset file to be converted in a vocabulary.")
+    parser.add_argument(
+        "--pytorch_dump_folder_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the folder to store the PyTorch model or dataset/vocab.",
+    )
+    parser.add_argument(
+        "--tf_checkpoint_path",
+        default="",
+        type=str,
+        help="An optional path to a TensorFlow checkpoint path to be converted.",
+    )
+    parser.add_argument(
+        "--transfo_xl_config_file",
+        default="",
+        type=str,
+        help="An optional config json file corresponding to the pre-trained BERT model. \n"
+        "This specifies the model architecture.",
+    )
+    parser.add_argument(
+        "--transfo_xl_dataset_file",
+        default="",
+        type=str,
+        help="An optional dataset file to be converted in a vocabulary.",
+    )
     args = parser.parse_args()
-    convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path,
-                                     args.transfo_xl_config_file,
-                                     args.pytorch_dump_folder_path,
-                                     args.transfo_xl_dataset_file)
+    convert_transfo_xl_checkpoint_to_pytorch(
+        args.tf_checkpoint_path,
+        args.transfo_xl_config_file,
+        args.pytorch_dump_folder_path,
+        args.transfo_xl_dataset_file,
+    )
diff --git a/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
index 91133ef56a..7cbf9cae95 100755
--- a/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
@@ -27,32 +27,34 @@ from transformers import CONFIG_NAME, WEIGHTS_NAME
 from transformers.tokenization_xlm import VOCAB_FILES_NAMES
 
 import logging
+
 logging.basicConfig(level=logging.INFO)
 
+
 def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
     # Load checkpoint
-    chkpt = torch.load(xlm_checkpoint_path, map_location='cpu')
+    chkpt = torch.load(xlm_checkpoint_path, map_location="cpu")
 
-    state_dict = chkpt['model']
+    state_dict = chkpt["model"]
 
     # We have the base model one level deeper than the original XLM repository
     two_levels_state_dict = {}
     for k, v in state_dict.items():
-        if 'pred_layer' in k:
+        if "pred_layer" in k:
             two_levels_state_dict[k] = v
         else:
-            two_levels_state_dict['transformer.' + k] = v
+            two_levels_state_dict["transformer." + k] = v
 
-    config = chkpt['params']
+    config = chkpt["params"]
     config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
 
-    vocab = chkpt['dico_word2id']
-    vocab = dict((s + '</w>' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items())
+    vocab = chkpt["dico_word2id"]
+    vocab = dict((s + "</w>" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items())
 
     # Save pytorch-model
-    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
-    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
-    pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' +  VOCAB_FILES_NAMES['vocab_file']
+    pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
+    pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
+    pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"]
 
     print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
     torch.save(two_levels_state_dict, pytorch_weights_dump_path)
@@ -69,15 +71,11 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
-    parser.add_argument("--xlm_checkpoint_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path the official PyTorch dump.")
-    parser.add_argument("--pytorch_dump_folder_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the output PyTorch model.")
+    parser.add_argument(
+        "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
     args = parser.parse_args()
     convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path)
diff --git a/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
index 3669d9944c..83688cf07b 100755
--- a/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
@@ -22,11 +22,15 @@ import os
 import argparse
 import torch
 
-from transformers import (CONFIG_NAME, WEIGHTS_NAME,
-                                                    XLNetConfig,
-                                                    XLNetLMHeadModel, XLNetForQuestionAnswering,
-                                                    XLNetForSequenceClassification,
-                                                    load_tf_weights_in_xlnet)
+from transformers import (
+    CONFIG_NAME,
+    WEIGHTS_NAME,
+    XLNetConfig,
+    XLNetLMHeadModel,
+    XLNetForQuestionAnswering,
+    XLNetForSequenceClassification,
+    load_tf_weights_in_xlnet,
+)
 
 GLUE_TASKS_NUM_LABELS = {
     "cola": 2,
@@ -41,9 +45,13 @@ GLUE_TASKS_NUM_LABELS = {
 }
 
 import logging
+
 logging.basicConfig(level=logging.INFO)
 
-def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None):
+
+def convert_xlnet_checkpoint_to_pytorch(
+    tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None
+):
     # Initialise PyTorch model
     config = XLNetConfig.from_json_file(bert_config_file)
 
@@ -53,7 +61,7 @@ def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, py
         config.finetuning_task = finetuning_task
         config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task]
         model = XLNetForSequenceClassification(config)
-    elif 'squad' in finetuning_task:
+    elif "squad" in finetuning_task:
         config.finetuning_task = finetuning_task
         model = XLNetForQuestionAnswering(config)
     else:
@@ -75,30 +83,33 @@ def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, py
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
-    parser.add_argument("--tf_checkpoint_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the TensorFlow checkpoint path.")
-    parser.add_argument("--xlnet_config_file",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "The config json file corresponding to the pre-trained XLNet model. \n"
-                               "This specifies the model architecture.")
-    parser.add_argument("--pytorch_dump_folder_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the folder to store the PyTorch model or dataset/vocab.")
-    parser.add_argument("--finetuning_task",
-                        default = None,
-                        type = str,
-                        help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned")
+    parser.add_argument(
+        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
+    )
+    parser.add_argument(
+        "--xlnet_config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The config json file corresponding to the pre-trained XLNet model. \n"
+        "This specifies the model architecture.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the folder to store the PyTorch model or dataset/vocab.",
+    )
+    parser.add_argument(
+        "--finetuning_task",
+        default=None,
+        type=str,
+        help="Name of a task on which the XLNet TensorFloaw model was fine-tuned",
+    )
     args = parser.parse_args()
     print(args)
 
-    convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path,
-                                        args.xlnet_config_file,
-                                        args.pytorch_dump_folder_path,
-                                        args.finetuning_task)
+    convert_xlnet_checkpoint_to_pytorch(
+        args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task
+    )
diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py
index 5567952fd2..bac6c6e3af 100644
--- a/transformers/data/__init__.py
+++ b/transformers/data/__init__.py
@@ -1,8 +1,15 @@
-from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures, SingleSentenceClassificationProcessor
+from .processors import (
+    InputExample,
+    InputFeatures,
+    DataProcessor,
+    SquadFeatures,
+    SingleSentenceClassificationProcessor,
+)
 from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
 from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
 from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
 
 from .metrics import is_sklearn_available
+
 if is_sklearn_available():
     from .metrics import glue_compute_metrics, xnli_compute_metrics
diff --git a/transformers/data/metrics/__init__.py b/transformers/data/metrics/__init__.py
index 5a46eb05d3..bd3b76efc0 100644
--- a/transformers/data/metrics/__init__.py
+++ b/transformers/data/metrics/__init__.py
@@ -23,20 +23,22 @@ logger = logging.getLogger(__name__)
 try:
     from scipy.stats import pearsonr, spearmanr
     from sklearn.metrics import matthews_corrcoef, f1_score
+
     _has_sklearn = True
 except (AttributeError, ImportError) as e:
     logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html")
     _has_sklearn = False
 
+
 def is_sklearn_available():
     return _has_sklearn
 
+
 if _has_sklearn:
 
     def simple_accuracy(preds, labels):
         return (preds == labels).mean()
 
-
     def acc_and_f1(preds, labels):
         acc = simple_accuracy(preds, labels)
         f1 = f1_score(y_true=labels, y_pred=preds)
@@ -46,7 +48,6 @@ if _has_sklearn:
             "acc_and_f1": (acc + f1) / 2,
         }
 
-
     def pearson_and_spearman(preds, labels):
         pearson_corr = pearsonr(preds, labels)[0]
         spearman_corr = spearmanr(preds, labels)[0]
@@ -56,7 +57,6 @@ if _has_sklearn:
             "corr": (pearson_corr + spearman_corr) / 2,
         }
 
-
     def glue_compute_metrics(task_name, preds, labels):
         assert len(preds) == len(labels)
         if task_name == "cola":
@@ -82,7 +82,6 @@ if _has_sklearn:
         else:
             raise KeyError(task_name)
 
-
     def xnli_compute_metrics(task_name, preds, labels):
         assert len(preds) == len(labels)
         if task_name == "xnli":
diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py
index acbb884fb8..a867fe3fde 100644
--- a/transformers/data/metrics/squad_metrics.py
+++ b/transformers/data/metrics/squad_metrics.py
@@ -24,19 +24,21 @@ logger = logging.getLogger(__name__)
 
 def normalize_answer(s):
     """Lower text and remove punctuation, articles and extra whitespace."""
+
     def remove_articles(text):
-        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
-        return re.sub(regex, ' ', text)
+        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
+        return re.sub(regex, " ", text)
 
     def white_space_fix(text):
-        return ' '.join(text.split())
+        return " ".join(text.split())
 
     def remove_punc(text):
         exclude = set(string.punctuation)
-        return ''.join(ch for ch in text if ch not in exclude)
+        return "".join(ch for ch in text if ch not in exclude)
 
     def lower(text):
         return text.lower()
+
     return white_space_fix(remove_articles(remove_punc(lower(s))))
 
 
@@ -75,14 +77,14 @@ def get_raw_scores(examples, preds):
 
     for example in examples:
         qas_id = example.qas_id
-        gold_answers = [answer['text'] for answer in example.answers if normalize_answer(answer['text'])]
+        gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])]
 
         if not gold_answers:
             # For unanswerable questions, only correct answer is empty string
-            gold_answers = ['']
+            gold_answers = [""]
 
         if qas_id not in preds:
-            print('Missing prediction for %s' % qas_id)
+            print("Missing prediction for %s" % qas_id)
             continue
 
         prediction = preds[qas_id]
@@ -106,23 +108,27 @@ def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
 def make_eval_dict(exact_scores, f1_scores, qid_list=None):
     if not qid_list:
         total = len(exact_scores)
-        return collections.OrderedDict([
-            ('exact', 100.0 * sum(exact_scores.values()) / total),
-            ('f1', 100.0 * sum(f1_scores.values()) / total),
-            ('total', total),
-        ])
+        return collections.OrderedDict(
+            [
+                ("exact", 100.0 * sum(exact_scores.values()) / total),
+                ("f1", 100.0 * sum(f1_scores.values()) / total),
+                ("total", total),
+            ]
+        )
     else:
         total = len(qid_list)
-        return collections.OrderedDict([
-            ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
-            ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
-            ('total', total),
-        ])
+        return collections.OrderedDict(
+            [
+                ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total),
+                ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total),
+                ("total", total),
+            ]
+        )
 
 
 def merge_eval(main_eval, new_eval, prefix):
     for k in new_eval:
-        main_eval['%s_%s' % (prefix, k)] = new_eval[k]
+        main_eval["%s_%s" % (prefix, k)] = new_eval[k]
 
 
 def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
@@ -160,16 +166,14 @@ def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
 
 
 def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
-    best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(
-        preds, exact_raw, na_probs, qid_to_has_ans)
-    best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(
-        preds, f1_raw, na_probs, qid_to_has_ans)
-    main_eval['best_exact'] = best_exact
-    main_eval['best_exact_thresh'] = exact_thresh
-    main_eval['best_f1'] = best_f1
-    main_eval['best_f1_thresh'] = f1_thresh
-    main_eval['has_ans_exact'] = has_ans_exact
-    main_eval['has_ans_f1'] = has_ans_f1
+    best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
+    best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
+    main_eval["best_exact"] = best_exact
+    main_eval["best_exact_thresh"] = exact_thresh
+    main_eval["best_f1"] = best_f1
+    main_eval["best_f1_thresh"] = f1_thresh
+    main_eval["has_ans_exact"] = has_ans_exact
+    main_eval["has_ans_f1"] = has_ans_f1
 
 
 def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
@@ -199,10 +203,10 @@ def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_h
     best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
     best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
 
-    main_eval['best_exact'] = best_exact
-    main_eval['best_exact_thresh'] = exact_thresh
-    main_eval['best_f1'] = best_f1
-    main_eval['best_f1_thresh'] = f1_thresh
+    main_eval["best_exact"] = best_exact
+    main_eval["best_exact_thresh"] = exact_thresh
+    main_eval["best_f1"] = best_f1
+    main_eval["best_f1_thresh"] = f1_thresh
 
 
 def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
@@ -215,18 +219,20 @@ def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_
 
     exact, f1 = get_raw_scores(examples, preds)
 
-    exact_threshold = apply_no_ans_threshold(exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
+    exact_threshold = apply_no_ans_threshold(
+        exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold
+    )
     f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
 
     evaluation = make_eval_dict(exact_threshold, f1_threshold)
 
     if has_answer_qids:
         has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids)
-        merge_eval(evaluation, has_ans_eval, 'HasAns')
+        merge_eval(evaluation, has_ans_eval, "HasAns")
 
     if no_answer_qids:
         no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids)
-        merge_eval(evaluation, no_ans_eval, 'NoAns')
+        merge_eval(evaluation, no_ans_eval, "NoAns")
 
     if no_answer_probs:
         find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer)
@@ -284,8 +290,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
     start_position = tok_text.find(pred_text)
     if start_position == -1:
         if verbose_logging:
-            logger.info(
-                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
+            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
         return orig_text
     end_position = start_position + len(pred_text) - 1
 
@@ -294,8 +299,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
 
     if len(orig_ns_text) != len(tok_ns_text):
         if verbose_logging:
-            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
-                        orig_ns_text, tok_ns_text)
+            logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
         return orig_text
 
     # We then project the characters in `pred_text` back to `orig_text` using
@@ -326,7 +330,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
             logger.info("Couldn't map end position")
         return orig_text
 
-    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
+    output_text = orig_text[orig_start_position : (orig_end_position + 1)]
     return output_text
 
 
@@ -393,8 +397,8 @@ def compute_predictions_logits(
         unique_id_to_result[result.unique_id] = result
 
     _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction",
-        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
+        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
+    )
 
     all_predictions = collections.OrderedDict()
     all_nbest_json = collections.OrderedDict()
@@ -447,7 +451,9 @@ def compute_predictions_logits(
                             start_index=start_index,
                             end_index=end_index,
                             start_logit=result.start_logits[start_index],
-                            end_logit=result.end_logits[end_index]))
+                            end_logit=result.end_logits[end_index],
+                        )
+                    )
         if version_2_with_negative:
             prelim_predictions.append(
                 _PrelimPrediction(
@@ -455,14 +461,14 @@ def compute_predictions_logits(
                     start_index=0,
                     end_index=0,
                     start_logit=null_start_logit,
-                    end_logit=null_end_logit))
-        prelim_predictions = sorted(
-            prelim_predictions,
-            key=lambda x: (x.start_logit + x.end_logit),
-            reverse=True)
+                    end_logit=null_end_logit,
+                )
+            )
+        prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
 
         _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-            "NbestPrediction", ["text", "start_logit", "end_logit"])
+            "NbestPrediction", ["text", "start_logit", "end_logit"]
+        )
 
         seen_predictions = {}
         nbest = []
@@ -471,10 +477,10 @@ def compute_predictions_logits(
                 break
             feature = features[pred.feature_index]
             if pred.start_index > 0:  # this is a non-null prediction
-                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+                tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
                 orig_doc_start = feature.token_to_orig_map[pred.start_index]
                 orig_doc_end = feature.token_to_orig_map[pred.end_index]
-                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+                orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
 
                 tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
 
@@ -498,31 +504,21 @@ def compute_predictions_logits(
                 final_text = ""
                 seen_predictions[final_text] = True
 
-            nbest.append(
-                _NbestPrediction(
-                    text=final_text,
-                    start_logit=pred.start_logit,
-                    end_logit=pred.end_logit))
+            nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
         # if we didn't include the empty option in the n-best, include it
         if version_2_with_negative:
             if "" not in seen_predictions:
-                nbest.append(
-                    _NbestPrediction(
-                        text="",
-                        start_logit=null_start_logit,
-                        end_logit=null_end_logit))
+                nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
 
             # In very rare edge cases we could only have single null prediction.
             # So we just create a nonce prediction in this case to avoid failure.
             if len(nbest) == 1:
-                nbest.insert(0,
-                             _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+                nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
 
         # In very rare edge cases we could have no valid predictions. So we
         # just create a nonce prediction in this case to avoid failure.
         if not nbest:
-            nbest.append(
-                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+            nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
 
         assert len(nbest) >= 1
 
@@ -551,8 +547,7 @@ def compute_predictions_logits(
             all_predictions[example.qas_id] = nbest_json[0]["text"]
         else:
             # predict "" iff the null score - the score of best non-null > threshold
-            score_diff = score_null - best_non_null_entry.start_logit - (
-                best_non_null_entry.end_logit)
+            score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
             scores_diff_json[example.qas_id] = score_diff
             if score_diff > null_score_diff_threshold:
                 all_predictions[example.qas_id] = ""
@@ -586,7 +581,7 @@ def compute_predictions_log_probs(
     end_n_top,
     version_2_with_negative,
     tokenizer,
-    verbose_logging
+    verbose_logging,
 ):
     """ XLNet write prediction logic (more complex than Bert's).
         Write final predictions to the json file and log-odds of null if needed.
@@ -594,12 +589,12 @@ def compute_predictions_log_probs(
         Requires utils_squad_evaluate.py
     """
     _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction",
-        ["feature_index", "start_index", "end_index",
-         "start_log_prob", "end_log_prob"])
+        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]
+    )
 
     _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
+        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]
+    )
 
     logger.info("Writing predictions to: %s", output_prediction_file)
     # logger.info("Writing nbest to: %s" % (output_nbest_file))
@@ -663,12 +658,13 @@ def compute_predictions_log_probs(
                             start_index=start_index,
                             end_index=end_index,
                             start_log_prob=start_log_prob,
-                            end_log_prob=end_log_prob))
+                            end_log_prob=end_log_prob,
+                        )
+                    )
 
         prelim_predictions = sorted(
-            prelim_predictions,
-            key=lambda x: (x.start_log_prob + x.end_log_prob),
-            reverse=True)
+            prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True
+        )
 
         seen_predictions = {}
         nbest = []
@@ -688,10 +684,10 @@ def compute_predictions_log_probs(
             # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
 
             # Previously used Bert untokenizer
-            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+            tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
             orig_doc_start = feature.token_to_orig_map[pred.start_index]
             orig_doc_end = feature.token_to_orig_map[pred.end_index]
-            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+            orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
             tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
 
             # Clean whitespace
@@ -704,8 +700,7 @@ def compute_predictions_log_probs(
             else:
                 do_lower_case = tokenizer.do_lowercase_and_remove_accent
 
-            final_text = get_final_text(tok_text, orig_text, do_lower_case,
-                                        verbose_logging)
+            final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
 
             if final_text in seen_predictions:
                 continue
@@ -713,17 +708,13 @@ def compute_predictions_log_probs(
             seen_predictions[final_text] = True
 
             nbest.append(
-                _NbestPrediction(
-                    text=final_text,
-                    start_log_prob=pred.start_log_prob,
-                    end_log_prob=pred.end_log_prob))
+                _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob)
+            )
 
         # In very rare edge cases we could have no valid predictions. So we
         # just create a nonce prediction in this case to avoid failure.
         if not nbest:
-            nbest.append(
-                _NbestPrediction(text="", start_log_prob=-1e6,
-                end_log_prob=-1e6))
+            nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6))
 
         total_scores = []
         best_non_null_entry = None
diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py
index 4f7307bb7b..e59e9fbcb2 100644
--- a/transformers/data/processors/__init__.py
+++ b/transformers/data/processors/__init__.py
@@ -1,4 +1,4 @@
 from .utils import InputExample, InputFeatures, DataProcessor, SingleSentenceClassificationProcessor
 from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
 from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
-from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
\ No newline at end of file
+from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
diff --git a/transformers/data/processors/glue.py b/transformers/data/processors/glue.py
index 11ebd949de..f9c0132a71 100644
--- a/transformers/data/processors/glue.py
+++ b/transformers/data/processors/glue.py
@@ -27,15 +27,18 @@ if is_tf_available():
 logger = logging.getLogger(__name__)
 
 
-def glue_convert_examples_to_features(examples, tokenizer,
-                                      max_length=512,
-                                      task=None,
-                                      label_list=None,
-                                      output_mode=None,
-                                      pad_on_left=False,
-                                      pad_token=0,
-                                      pad_token_segment_id=0,
-                                      mask_padding_with_zero=True):
+def glue_convert_examples_to_features(
+    examples,
+    tokenizer,
+    max_length=512,
+    task=None,
+    label_list=None,
+    output_mode=None,
+    pad_on_left=False,
+    pad_token=0,
+    pad_token_segment_id=0,
+    mask_padding_with_zero=True,
+):
     """
     Loads a data file into a list of ``InputFeatures``
 
@@ -82,12 +85,7 @@ def glue_convert_examples_to_features(examples, tokenizer,
             example = processor.get_example_from_tensor_dict(example)
             example = processor.tfds_map(example)
 
-        inputs = tokenizer.encode_plus(
-            example.text_a,
-            example.text_b,
-            add_special_tokens=True,
-            max_length=max_length,
-        )
+        inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,)
         input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
 
         # The mask has 1 for real tokens and 0 for padding tokens. Only real
@@ -106,8 +104,12 @@ def glue_convert_examples_to_features(examples, tokenizer,
             token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
 
         assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
-        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
-        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
+        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
+            len(attention_mask), max_length
+        )
+        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
+            len(token_type_ids), max_length
+        )
 
         if output_mode == "classification":
             label = label_map[example.label]
@@ -125,28 +127,36 @@ def glue_convert_examples_to_features(examples, tokenizer,
             logger.info("label: %s (id = %d)" % (example.label, label))
 
         features.append(
-                InputFeatures(input_ids=input_ids,
-                              attention_mask=attention_mask,
-                              token_type_ids=token_type_ids,
-                              label=label))
+            InputFeatures(
+                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label
+            )
+        )
 
     if is_tf_available() and is_tf_dataset:
+
         def gen():
             for ex in features:
-                yield ({'input_ids': ex.input_ids,
-                         'attention_mask': ex.attention_mask,
-                         'token_type_ids': ex.token_type_ids},
-                        ex.label)
+                yield (
+                    {
+                        "input_ids": ex.input_ids,
+                        "attention_mask": ex.attention_mask,
+                        "token_type_ids": ex.token_type_ids,
+                    },
+                    ex.label,
+                )
 
-        return tf.data.Dataset.from_generator(gen,
-            ({'input_ids': tf.int32,
-              'attention_mask': tf.int32,
-              'token_type_ids': tf.int32},
-             tf.int64),
-            ({'input_ids': tf.TensorShape([None]),
-              'attention_mask': tf.TensorShape([None]),
-              'token_type_ids': tf.TensorShape([None])},
-             tf.TensorShape([])))
+        return tf.data.Dataset.from_generator(
+            gen,
+            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
+            (
+                {
+                    "input_ids": tf.TensorShape([None]),
+                    "attention_mask": tf.TensorShape([None]),
+                    "token_type_ids": tf.TensorShape([None]),
+                },
+                tf.TensorShape([]),
+            ),
+        )
 
     return features
 
@@ -156,21 +166,21 @@ class MrpcProcessor(DataProcessor):
 
     def get_example_from_tensor_dict(self, tensor_dict):
         """See base class."""
-        return InputExample(tensor_dict['idx'].numpy(),
-                            tensor_dict['sentence1'].numpy().decode('utf-8'),
-                            tensor_dict['sentence2'].numpy().decode('utf-8'),
-                            str(tensor_dict['label'].numpy()))
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["sentence1"].numpy().decode("utf-8"),
+            tensor_dict["sentence2"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
 
     def get_train_examples(self, data_dir):
         """See base class."""
         logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
 
     def get_dev_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
 
     def get_labels(self):
         """See base class."""
@@ -186,8 +196,7 @@ class MrpcProcessor(DataProcessor):
             text_a = line[3]
             text_b = line[4]
             label = line[0]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
         return examples
 
 
@@ -196,21 +205,20 @@ class MnliProcessor(DataProcessor):
 
     def get_example_from_tensor_dict(self, tensor_dict):
         """See base class."""
-        return InputExample(tensor_dict['idx'].numpy(),
-                            tensor_dict['premise'].numpy().decode('utf-8'),
-                            tensor_dict['hypothesis'].numpy().decode('utf-8'),
-                            str(tensor_dict['label'].numpy()))
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["premise"].numpy().decode("utf-8"),
+            tensor_dict["hypothesis"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
 
     def get_train_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
 
     def get_dev_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
-            "dev_matched")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched")
 
     def get_labels(self):
         """See base class."""
@@ -226,8 +234,7 @@ class MnliProcessor(DataProcessor):
             text_a = line[8]
             text_b = line[9]
             label = line[-1]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
         return examples
 
 
@@ -236,9 +243,7 @@ class MnliMismatchedProcessor(MnliProcessor):
 
     def get_dev_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")),
-            "dev_matched")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched")
 
 
 class ColaProcessor(DataProcessor):
@@ -246,20 +251,20 @@ class ColaProcessor(DataProcessor):
 
     def get_example_from_tensor_dict(self, tensor_dict):
         """See base class."""
-        return InputExample(tensor_dict['idx'].numpy(),
-                            tensor_dict['sentence'].numpy().decode('utf-8'),
-                            None,
-                            str(tensor_dict['label'].numpy()))
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["sentence"].numpy().decode("utf-8"),
+            None,
+            str(tensor_dict["label"].numpy()),
+        )
 
     def get_train_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
 
     def get_dev_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
 
     def get_labels(self):
         """See base class."""
@@ -272,8 +277,7 @@ class ColaProcessor(DataProcessor):
             guid = "%s-%s" % (set_type, i)
             text_a = line[3]
             label = line[1]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
         return examples
 
 
@@ -282,20 +286,20 @@ class Sst2Processor(DataProcessor):
 
     def get_example_from_tensor_dict(self, tensor_dict):
         """See base class."""
-        return InputExample(tensor_dict['idx'].numpy(),
-                            tensor_dict['sentence'].numpy().decode('utf-8'),
-                            None,
-                            str(tensor_dict['label'].numpy()))
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["sentence"].numpy().decode("utf-8"),
+            None,
+            str(tensor_dict["label"].numpy()),
+        )
 
     def get_train_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
 
     def get_dev_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
 
     def get_labels(self):
         """See base class."""
@@ -310,8 +314,7 @@ class Sst2Processor(DataProcessor):
             guid = "%s-%s" % (set_type, i)
             text_a = line[0]
             label = line[1]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
         return examples
 
 
@@ -320,20 +323,20 @@ class StsbProcessor(DataProcessor):
 
     def get_example_from_tensor_dict(self, tensor_dict):
         """See base class."""
-        return InputExample(tensor_dict['idx'].numpy(),
-                            tensor_dict['sentence1'].numpy().decode('utf-8'),
-                            tensor_dict['sentence2'].numpy().decode('utf-8'),
-                            str(tensor_dict['label'].numpy()))
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["sentence1"].numpy().decode("utf-8"),
+            tensor_dict["sentence2"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
 
     def get_train_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
 
     def get_dev_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
 
     def get_labels(self):
         """See base class."""
@@ -349,8 +352,7 @@ class StsbProcessor(DataProcessor):
             text_a = line[7]
             text_b = line[8]
             label = line[-1]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
         return examples
 
 
@@ -359,20 +361,20 @@ class QqpProcessor(DataProcessor):
 
     def get_example_from_tensor_dict(self, tensor_dict):
         """See base class."""
-        return InputExample(tensor_dict['idx'].numpy(),
-                            tensor_dict['question1'].numpy().decode('utf-8'),
-                            tensor_dict['question2'].numpy().decode('utf-8'),
-                            str(tensor_dict['label'].numpy()))
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["question1"].numpy().decode("utf-8"),
+            tensor_dict["question2"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
 
     def get_train_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
 
     def get_dev_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
 
     def get_labels(self):
         """See base class."""
@@ -391,8 +393,7 @@ class QqpProcessor(DataProcessor):
                 label = line[5]
             except IndexError:
                 continue
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
         return examples
 
 
@@ -401,21 +402,20 @@ class QnliProcessor(DataProcessor):
 
     def get_example_from_tensor_dict(self, tensor_dict):
         """See base class."""
-        return InputExample(tensor_dict['idx'].numpy(),
-                            tensor_dict['question'].numpy().decode('utf-8'),
-                            tensor_dict['sentence'].numpy().decode('utf-8'),
-                            str(tensor_dict['label'].numpy()))
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["question"].numpy().decode("utf-8"),
+            tensor_dict["sentence"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
 
     def get_train_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
 
     def get_dev_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")),
-            "dev_matched")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched")
 
     def get_labels(self):
         """See base class."""
@@ -431,8 +431,7 @@ class QnliProcessor(DataProcessor):
             text_a = line[1]
             text_b = line[2]
             label = line[-1]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
         return examples
 
 
@@ -441,20 +440,20 @@ class RteProcessor(DataProcessor):
 
     def get_example_from_tensor_dict(self, tensor_dict):
         """See base class."""
-        return InputExample(tensor_dict['idx'].numpy(),
-                            tensor_dict['sentence1'].numpy().decode('utf-8'),
-                            tensor_dict['sentence2'].numpy().decode('utf-8'),
-                            str(tensor_dict['label'].numpy()))
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["sentence1"].numpy().decode("utf-8"),
+            tensor_dict["sentence2"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
 
     def get_train_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
 
     def get_dev_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
 
     def get_labels(self):
         """See base class."""
@@ -470,8 +469,7 @@ class RteProcessor(DataProcessor):
             text_a = line[1]
             text_b = line[2]
             label = line[-1]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
         return examples
 
 
@@ -480,20 +478,20 @@ class WnliProcessor(DataProcessor):
 
     def get_example_from_tensor_dict(self, tensor_dict):
         """See base class."""
-        return InputExample(tensor_dict['idx'].numpy(),
-                            tensor_dict['sentence1'].numpy().decode('utf-8'),
-                            tensor_dict['sentence2'].numpy().decode('utf-8'),
-                            str(tensor_dict['label'].numpy()))
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["sentence1"].numpy().decode("utf-8"),
+            tensor_dict["sentence2"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
 
     def get_train_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
 
     def get_dev_examples(self, data_dir):
         """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
 
     def get_labels(self):
         """See base class."""
@@ -509,10 +507,10 @@ class WnliProcessor(DataProcessor):
             text_a = line[1]
             text_b = line[2]
             label = line[-1]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
         return examples
 
+
 glue_tasks_num_labels = {
     "cola": 2,
     "mnli": 3,
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index fd5150e93f..efb10830bd 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -82,8 +82,8 @@ def _is_whitespace(c):
         return True
     return False
 
-def squad_convert_example_to_features(example, max_seq_length,
-                                       doc_stride, max_query_length, is_training):
+
+def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training):
     features = []
     if is_training and not example.is_impossible:
         # Get start and end position
@@ -91,7 +91,7 @@ def squad_convert_example_to_features(example, max_seq_length,
         end_position = example.end_position
 
         # If the answer cannot be found in the text, then skip this example.
-        actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)])
+        actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
         cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
         if actual_text.find(cleaned_answer_text) == -1:
             logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
@@ -121,8 +121,11 @@ def squad_convert_example_to_features(example, max_seq_length,
     spans = []
 
     truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
-    sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + 1 \
-        if 'roberta' in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence
+    sequence_added_tokens = (
+        tokenizer.max_len - tokenizer.max_len_single_sentence + 1
+        if "roberta" in str(type(tokenizer))
+        else tokenizer.max_len - tokenizer.max_len_single_sentence
+    )
     sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair
 
     span_doc_tokens = all_doc_tokens
@@ -135,16 +138,18 @@ def squad_convert_example_to_features(example, max_seq_length,
             return_overflowing_tokens=True,
             pad_to_max_length=True,
             stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
-            truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
+            truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
         )
 
-        paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride,
-                            max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
+        paragraph_len = min(
+            len(all_doc_tokens) - len(spans) * doc_stride,
+            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
+        )
 
-        if tokenizer.pad_token_id in encoded_dict['input_ids']:
-            non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
+        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
+            non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
         else:
-            non_padded_ids = encoded_dict['input_ids']
+            non_padded_ids = encoded_dict["input_ids"]
 
         tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
 
@@ -170,17 +175,20 @@ def squad_convert_example_to_features(example, max_seq_length,
     for doc_span_index in range(len(spans)):
         for j in range(spans[doc_span_index]["paragraph_len"]):
             is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
-            index = j if tokenizer.padding_side == "left" else spans[doc_span_index][
-                                                                   "truncated_query_with_special_tokens_length"] + j
+            index = (
+                j
+                if tokenizer.padding_side == "left"
+                else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+            )
             spans[doc_span_index]["token_is_max_context"][index] = is_max_context
 
     for span in spans:
         # Identify the position of the CLS token
-        cls_index = span['input_ids'].index(tokenizer.cls_token_id)
+        cls_index = span["input_ids"].index(tokenizer.cls_token_id)
 
         # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
         # Original TF implem also keep the classification token (set to 0) (not sure why...)
-        p_mask = np.array(span['token_type_ids'])
+        p_mask = np.array(span["token_type_ids"])
 
         p_mask = np.minimum(p_mask, 1)
 
@@ -219,31 +227,34 @@ def squad_convert_example_to_features(example, max_seq_length,
                 start_position = tok_start_position - doc_start + doc_offset
                 end_position = tok_end_position - doc_start + doc_offset
 
-        features.append(SquadFeatures(
-            span['input_ids'],
-            span['attention_mask'],
-            span['token_type_ids'],
-            cls_index,
-            p_mask.tolist(),
-            example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing.
-            unique_id=0,
-            paragraph_len=span['paragraph_len'],
-            token_is_max_context=span["token_is_max_context"],
-            tokens=span["tokens"],
-            token_to_orig_map=span["token_to_orig_map"],
-
-            start_position=start_position,
-            end_position=end_position
-        ))
+        features.append(
+            SquadFeatures(
+                span["input_ids"],
+                span["attention_mask"],
+                span["token_type_ids"],
+                cls_index,
+                p_mask.tolist(),
+                example_index=0,  # Can not set unique_id and example_index here. They will be set after multiple processing.
+                unique_id=0,
+                paragraph_len=span["paragraph_len"],
+                token_is_max_context=span["token_is_max_context"],
+                tokens=span["tokens"],
+                token_to_orig_map=span["token_to_orig_map"],
+                start_position=start_position,
+                end_position=end_position,
+            )
+        )
     return features
 
+
 def squad_convert_example_to_features_init(tokenizer_for_convert):
     global tokenizer
     tokenizer = tokenizer_for_convert
 
-def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                       doc_stride, max_query_length, is_training, 
-                                       return_dataset=False, threads=1):
+
+def squad_convert_examples_to_features(
+    examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False, threads=1
+):
     """
     Converts a list of examples into a list of features that can be directly given as input to a model.
     It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
@@ -279,17 +290,28 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         )
     """
 
-    # Defining helper methods    
+    # Defining helper methods
     features = []
     threads = min(threads, cpu_count())
     with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
-        annotate_ = partial(squad_convert_example_to_features, max_seq_length=max_seq_length,
-                                       doc_stride=doc_stride, max_query_length=max_query_length, is_training=is_training)
-        features = list(tqdm(p.imap(annotate_, examples, chunksize=32), total=len(examples), desc='convert squad examples to features'))
+        annotate_ = partial(
+            squad_convert_example_to_features,
+            max_seq_length=max_seq_length,
+            doc_stride=doc_stride,
+            max_query_length=max_query_length,
+            is_training=is_training,
+        )
+        features = list(
+            tqdm(
+                p.imap(annotate_, examples, chunksize=32),
+                total=len(examples),
+                desc="convert squad examples to features",
+            )
+        )
     new_features = []
     unique_id = 1000000000
     example_index = 0
-    for example_features in tqdm(features, total=len(features), desc='add example index and unique id'):
+    for example_features in tqdm(features, total=len(features), desc="add example index and unique id"):
         if not example_features:
             continue
         for example_feature in example_features:
@@ -300,7 +322,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         example_index += 1
     features = new_features
     del new_features
-    if return_dataset == 'pt':
+    if return_dataset == "pt":
         if not is_torch_available():
             raise ImportError("Pytorch must be installed to return a pytorch dataset.")
 
@@ -341,12 +363,13 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                         "input_ids": ex.input_ids,
                         "attention_mask": ex.attention_mask,
                         "token_type_ids": ex.token_type_ids,
-                    }, {
+                    },
+                    {
                         "start_position": ex.start_position,
                         "end_position": ex.end_position,
                         "cls_index": ex.cls_index,
                         "p_mask": ex.p_mask,
-                    }
+                    },
                 )
 
         return tf.data.Dataset.from_generator(
diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py
index ee234e6e90..41cc00d4bd 100644
--- a/transformers/data/processors/utils.py
+++ b/transformers/data/processors/utils.py
@@ -24,6 +24,7 @@ from ...file_utils import is_tf_available, is_torch_available
 
 logger = logging.getLogger(__name__)
 
+
 class InputExample(object):
     """
     A single training/test example for simple sequence classification.
@@ -37,6 +38,7 @@ class InputExample(object):
         label: (Optional) string. The label of the example. This should be
         specified for train and dev examples, but not for test examples.
     """
+
     def __init__(self, guid, text_a, text_b=None, label=None):
         self.guid = guid
         self.text_a = text_a
@@ -99,14 +101,15 @@ class DataProcessor(object):
             lines = []
             for line in reader:
                 if sys.version_info[0] == 2:
-                    line = list(unicode(cell, 'utf-8') for cell in line)
+                    line = list(unicode(cell, "utf-8") for cell in line)
                 lines.append(line)
             return lines
 
 
 class SingleSentenceClassificationProcessor(DataProcessor):
     """ Generic processor for a single sentence classification data set."""
-    def __init__(self, labels=None, examples=None, mode='classification', verbose=False):
+
+    def __init__(self, labels=None, examples=None, mode="classification", verbose=False):
         self.labels = [] if labels is None else labels
         self.examples = [] if examples is None else examples
         self.mode = mode
@@ -117,22 +120,24 @@ class SingleSentenceClassificationProcessor(DataProcessor):
 
     def __getitem__(self, idx):
         if isinstance(idx, slice):
-            return SingleSentenceClassificationProcessor(labels=self.labels,
-                                                         examples=self.examples[idx])
+            return SingleSentenceClassificationProcessor(labels=self.labels, examples=self.examples[idx])
         return self.examples[idx]
 
     @classmethod
-    def create_from_csv(cls, file_name, split_name='', column_label=0, column_text=1,
-                        column_id=None, skip_first_row=False, **kwargs):
+    def create_from_csv(
+        cls, file_name, split_name="", column_label=0, column_text=1, column_id=None, skip_first_row=False, **kwargs
+    ):
         processor = cls(**kwargs)
-        processor.add_examples_from_csv(file_name,
-                                        split_name=split_name,
-                                        column_label=column_label,
-                                        column_text=column_text,
-                                        column_id=column_id,
-                                        skip_first_row=skip_first_row,
-                                        overwrite_labels=True,
-                                        overwrite_examples=True)
+        processor.add_examples_from_csv(
+            file_name,
+            split_name=split_name,
+            column_label=column_label,
+            column_text=column_text,
+            column_id=column_id,
+            skip_first_row=skip_first_row,
+            overwrite_labels=True,
+            overwrite_examples=True,
+        )
         return processor
 
     @classmethod
@@ -141,8 +146,17 @@ class SingleSentenceClassificationProcessor(DataProcessor):
         processor.add_examples(texts_or_text_and_labels, labels=labels)
         return processor
 
-    def add_examples_from_csv(self, file_name, split_name='', column_label=0, column_text=1, column_id=None,
-                              skip_first_row=False, overwrite_labels=False, overwrite_examples=False):
+    def add_examples_from_csv(
+        self,
+        file_name,
+        split_name="",
+        column_label=0,
+        column_text=1,
+        column_id=None,
+        skip_first_row=False,
+        overwrite_labels=False,
+        overwrite_examples=False,
+    ):
         lines = self._read_tsv(file_name)
         if skip_first_row:
             lines = lines[1:]
@@ -158,10 +172,13 @@ class SingleSentenceClassificationProcessor(DataProcessor):
                 guid = "%s-%s" % (split_name, i) if split_name else "%s" % i
                 ids.append(guid)
 
-        return self.add_examples(texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples)
+        return self.add_examples(
+            texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples
+        )
 
-    def add_examples(self, texts_or_text_and_labels, labels=None, ids=None,
-                     overwrite_labels=False, overwrite_examples=False):
+    def add_examples(
+        self, texts_or_text_and_labels, labels=None, ids=None, overwrite_labels=False, overwrite_examples=False
+    ):
         assert labels is None or len(texts_or_text_and_labels) == len(labels)
         assert ids is None or len(texts_or_text_and_labels) == len(ids)
         if ids is None:
@@ -192,13 +209,15 @@ class SingleSentenceClassificationProcessor(DataProcessor):
 
         return self.examples
 
-    def get_features(self,
-                     tokenizer,
-                     max_length=None,
-                     pad_on_left=False,
-                     pad_token=0,
-                     mask_padding_with_zero=True,
-                     return_tensors=None):
+    def get_features(
+        self,
+        tokenizer,
+        max_length=None,
+        pad_on_left=False,
+        pad_token=0,
+        mask_padding_with_zero=True,
+        return_tensors=None,
+    ):
         """
         Convert examples in a list of ``InputFeatures``
 
@@ -231,9 +250,7 @@ class SingleSentenceClassificationProcessor(DataProcessor):
                 logger.info("Tokenizing example %d", ex_index)
 
             input_ids = tokenizer.encode(
-                example.text_a,
-                add_special_tokens=True,
-                max_length=min(max_length, tokenizer.max_len),
+                example.text_a, add_special_tokens=True, max_length=min(max_length, tokenizer.max_len),
             )
             all_input_ids.append(input_ids)
 
@@ -256,8 +273,12 @@ class SingleSentenceClassificationProcessor(DataProcessor):
                 input_ids = input_ids + ([pad_token] * padding_length)
                 attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
 
-            assert len(input_ids) == batch_length, "Error with input length {} vs {}".format(len(input_ids), batch_length)
-            assert len(attention_mask) == batch_length, "Error with input length {} vs {}".format(len(attention_mask), batch_length)
+            assert len(input_ids) == batch_length, "Error with input length {} vs {}".format(
+                len(input_ids), batch_length
+            )
+            assert len(attention_mask) == batch_length, "Error with input length {} vs {}".format(
+                len(attention_mask), batch_length
+            )
 
             if self.mode == "classification":
                 label = label_map[example.label]
@@ -273,36 +294,31 @@ class SingleSentenceClassificationProcessor(DataProcessor):
                 logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
                 logger.info("label: %s (id = %d)" % (example.label, label))
 
-            features.append(
-                    InputFeatures(input_ids=input_ids,
-                                  attention_mask=attention_mask,
-                                  label=label))
+            features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label))
 
         if return_tensors is None:
             return features
-        elif return_tensors == 'tf':
+        elif return_tensors == "tf":
             if not is_tf_available():
                 raise ImportError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported")
             import tensorflow as tf
+
             def gen():
                 for ex in features:
-                    yield  ({'input_ids': ex.input_ids,
-                            'attention_mask': ex.attention_mask},
-                            ex.label)
+                    yield ({"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}, ex.label)
 
-            dataset = tf.data.Dataset.from_generator(gen,
-                    ({'input_ids': tf.int32,
-                    'attention_mask': tf.int32},
-                    tf.int64),
-                    ({'input_ids': tf.TensorShape([None]),
-                    'attention_mask': tf.TensorShape([None])},
-                    tf.TensorShape([])))
+            dataset = tf.data.Dataset.from_generator(
+                gen,
+                ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
+                ({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}, tf.TensorShape([])),
+            )
             return dataset
-        elif return_tensors == 'pt':
+        elif return_tensors == "pt":
             if not is_torch_available():
                 raise ImportError("return_tensors set to 'pt' but PyTorch can't be imported")
             import torch
             from torch.utils.data import TensorDataset
+
             all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
             all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
             if self.mode == "classification":
diff --git a/transformers/data/processors/xnli.py b/transformers/data/processors/xnli.py
index 958bdf62f9..ffe0358c1e 100644
--- a/transformers/data/processors/xnli.py
+++ b/transformers/data/processors/xnli.py
@@ -24,11 +24,12 @@ from .utils import DataProcessor, InputExample
 
 logger = logging.getLogger(__name__)
 
+
 class XnliProcessor(DataProcessor):
     """Processor for the XNLI dataset.
     Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207"""
 
-    def __init__(self, language, train_language = None):
+    def __init__(self, language, train_language=None):
         self.language = language
         self.train_language = train_language
 
@@ -40,13 +41,12 @@ class XnliProcessor(DataProcessor):
         for (i, line) in enumerate(lines):
             if i == 0:
                 continue
-            guid = "%s-%s" % ('train', i)
+            guid = "%s-%s" % ("train", i)
             text_a = line[0]
             text_b = line[1]
             label = "contradiction" if line[2] == "contradictory" else line[2]
             assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
         return examples
 
     def get_test_examples(self, data_dir):
@@ -59,19 +59,19 @@ class XnliProcessor(DataProcessor):
             language = line[0]
             if language != self.language:
                 continue
-            guid = "%s-%s" % ('test', i)
+            guid = "%s-%s" % ("test", i)
             text_a = line[6]
             text_b = line[7]
             label = line[1]
             assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
         return examples
 
     def get_labels(self):
         """See base class."""
         return ["contradiction", "entailment", "neutral"]
 
+
 xnli_processors = {
     "xnli": XnliProcessor,
 }
diff --git a/transformers/file_utils.py b/transformers/file_utils.py
index ec925c6160..c45bdee04a 100644
--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -3,7 +3,7 @@ Utilities for working with the local dataset cache.
 This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
 Copyright by the AllenNLP authors.
 """
-from __future__ import (absolute_import, division, print_function, unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import sys
 import json
@@ -29,9 +29,10 @@ from filelock import FileLock
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
 try:
-    os.environ.setdefault('USE_TORCH', 'YES')
-    if os.environ['USE_TORCH'].upper() in ('1', 'ON', 'YES'):
+    os.environ.setdefault("USE_TORCH", "YES")
+    if os.environ["USE_TORCH"].upper() in ("1", "ON", "YES"):
         import torch
+
         _torch_available = True  # pylint: disable=invalid-name
         logger.info("PyTorch version {} available.".format(torch.__version__))
     else:
@@ -41,10 +42,11 @@ except ImportError:
     _torch_available = False  # pylint: disable=invalid-name
 
 try:
-    os.environ.setdefault('USE_TF', 'YES')
-    if os.environ['USE_TF'].upper() in ('1', 'ON', 'YES'):
+    os.environ.setdefault("USE_TF", "YES")
+    if os.environ["USE_TF"].upper() in ("1", "ON", "YES"):
         import tensorflow as tf
-        assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2
+
+        assert hasattr(tf, "__version__") and int(tf.__version__[0]) >= 2
         _tf_available = True  # pylint: disable=invalid-name
         logger.info("TensorFlow version {} available.".format(tf.__version__))
     else:
@@ -55,12 +57,13 @@ except (ImportError, AssertionError):
 
 try:
     from torch.hub import _get_torch_home
+
     torch_cache_home = _get_torch_home()
 except ImportError:
     torch_cache_home = os.path.expanduser(
-        os.getenv('TORCH_HOME', os.path.join(
-            os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
-default_cache_path = os.path.join(torch_cache_home, 'transformers')
+        os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch"))
+    )
+default_cache_path = os.path.join(torch_cache_home, "transformers")
 
 try:
     from urllib.parse import urlparse
@@ -69,19 +72,21 @@ except ImportError:
 
 try:
     from pathlib import Path
+
     PYTORCH_PRETRAINED_BERT_CACHE = Path(
-        os.getenv('PYTORCH_TRANSFORMERS_CACHE', os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path)))
+        os.getenv("PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path))
+    )
 except (AttributeError, ImportError):
-    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_TRANSFORMERS_CACHE',
-                                              os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
-                                                        default_cache_path))
+    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv(
+        "PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)
+    )
 
 PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
 TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
 
 WEIGHTS_NAME = "pytorch_model.bin"
-TF2_WEIGHTS_NAME = 'tf_model.h5'
-TF_WEIGHTS_NAME = 'model.ckpt'
+TF2_WEIGHTS_NAME = "tf_model.h5"
+TF_WEIGHTS_NAME = "model.ckpt"
 CONFIG_NAME = "config.json"
 MODEL_CARD_NAME = "modelcard.json"
 
@@ -95,38 +100,48 @@ CLOUDFRONT_DISTRIB_PREFIX = "https://d2ws9o8vfrpkyk.cloudfront.net"
 def is_torch_available():
     return _torch_available
 
+
 def is_tf_available():
 
     return _tf_available
 
+
 if not six.PY2:
+
     def add_start_docstrings(*docstr):
         def docstring_decorator(fn):
-            fn.__doc__ = ''.join(docstr) + fn.__doc__
+            fn.__doc__ = "".join(docstr) + fn.__doc__
             return fn
+
         return docstring_decorator
 
     def add_end_docstrings(*docstr):
         def docstring_decorator(fn):
-            fn.__doc__ = fn.__doc__ + ''.join(docstr)
+            fn.__doc__ = fn.__doc__ + "".join(docstr)
             return fn
+
         return docstring_decorator
+
+
 else:
     # Not possible to update class docstrings on python2
     def add_start_docstrings(*docstr):
         def docstring_decorator(fn):
             return fn
+
         return docstring_decorator
 
     def add_end_docstrings(*docstr):
         def docstring_decorator(fn):
             return fn
+
         return docstring_decorator
 
 
 def is_remote_url(url_or_filename):
     parsed = urlparse(url_or_filename)
-    return parsed.scheme in ('http', 'https', 's3')
+    return parsed.scheme in ("http", "https", "s3")
+
 
 def hf_bucket_url(identifier, postfix=None, cdn=False):
     endpoint = CLOUDFRONT_DISTRIB_PREFIX if cdn else S3_BUCKET_PREFIX
@@ -145,17 +160,17 @@ def url_to_filename(url, etag=None):
     so that TF 2.0 can identify it as a HDF5 file
     (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
     """
-    url_bytes = url.encode('utf-8')
+    url_bytes = url.encode("utf-8")
     url_hash = sha256(url_bytes)
     filename = url_hash.hexdigest()
 
     if etag:
-        etag_bytes = etag.encode('utf-8')
+        etag_bytes = etag.encode("utf-8")
         etag_hash = sha256(etag_bytes)
-        filename += '.' + etag_hash.hexdigest()
+        filename += "." + etag_hash.hexdigest()
 
-    if url.endswith('.h5'):
-        filename += '.h5'
+    if url.endswith(".h5"):
+        filename += ".h5"
 
     return filename
 
@@ -174,19 +189,21 @@ def filename_to_url(filename, cache_dir=None):
     if not os.path.exists(cache_path):
         raise EnvironmentError("file {} not found".format(cache_path))
 
-    meta_path = cache_path + '.json'
+    meta_path = cache_path + ".json"
     if not os.path.exists(meta_path):
         raise EnvironmentError("file {} not found".format(meta_path))
 
     with open(meta_path, encoding="utf-8") as meta_file:
         metadata = json.load(meta_file)
-    url = metadata['url']
-    etag = metadata['etag']
+    url = metadata["url"]
+    etag = metadata["etag"]
 
     return url, etag
 
 
-def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False, user_agent=None):
+def cached_path(
+    url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False, user_agent=None
+):
     """
     Given something that might be a URL (or might be a local path),
     determine which. If it's a URL, download the file and cache it, and
@@ -207,13 +224,18 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
 
     if is_remote_url(url_or_filename):
         # URL, so get it from the cache (downloading if necessary)
-        return get_from_cache(url_or_filename, cache_dir=cache_dir,
-            force_download=force_download, proxies=proxies,
-            resume_download=resume_download, user_agent=user_agent)
+        return get_from_cache(
+            url_or_filename,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            proxies=proxies,
+            resume_download=resume_download,
+            user_agent=user_agent,
+        )
     elif os.path.exists(url_or_filename):
         # File, and it exists.
         return url_or_filename
-    elif urlparse(url_or_filename).scheme == '':
+    elif urlparse(url_or_filename).scheme == "":
         # File, but it doesn't exist.
         raise EnvironmentError("file {} not found".format(url_or_filename))
     else:
@@ -273,31 +295,35 @@ def s3_get(url, temp_file, proxies=None):
 def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None):
     ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
     if isinstance(user_agent, dict):
-        ua += "; " + "; ".join(
-            "{}/{}".format(k, v) for k, v in user_agent.items()
-        )
+        ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
     elif isinstance(user_agent, six.string_types):
-        ua += "; "+ user_agent
-    headers = {
-        "user-agent": ua
-    }
+        ua += "; " + user_agent
+    headers = {"user-agent": ua}
     if resume_size > 0:
-        headers['Range'] = 'bytes=%d-' % (resume_size,)
+        headers["Range"] = "bytes=%d-" % (resume_size,)
     response = requests.get(url, stream=True, proxies=proxies, headers=headers)
     if response.status_code == 416:  # Range not satisfiable
         return
-    content_length = response.headers.get('Content-Length')
+    content_length = response.headers.get("Content-Length")
     total = resume_size + int(content_length) if content_length is not None else None
-    progress = tqdm(unit="B", unit_scale=True, total=total, initial=resume_size,
-                    desc="Downloading", disable=bool(logger.level<=logging.INFO))
+    progress = tqdm(
+        unit="B",
+        unit_scale=True,
+        total=total,
+        initial=resume_size,
+        desc="Downloading",
+        disable=bool(logger.level <= logging.INFO),
+    )
     for chunk in response.iter_content(chunk_size=1024):
-        if chunk: # filter out keep-alive new chunks
+        if chunk:  # filter out keep-alive new chunks
             progress.update(len(chunk))
             temp_file.write(chunk)
     progress.close()
 
 
-def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False, user_agent=None):
+def get_from_cache(
+    url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False, user_agent=None
+):
     """
     Given a URL, look for the corresponding dataset in the local cache.
     If it's not there, download it. Then return the path to the cached file.
@@ -326,7 +352,7 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag
             etag = None
 
     if sys.version_info[0] == 2 and etag is not None:
-        etag = etag.decode('utf-8')
+        etag = etag.decode("utf-8")
     filename = url_to_filename(url, etag)
 
     # get cache path to put the file
@@ -337,22 +363,24 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag
     if not os.path.exists(cache_path) and etag is None:
         matching_files = [
             file
-            for file in fnmatch.filter(os.listdir(cache_dir), filename + '.*')
-            if not file.endswith('.json') and not file.endswith('.lock')
+            for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*")
+            if not file.endswith(".json") and not file.endswith(".lock")
         ]
         if matching_files:
             cache_path = os.path.join(cache_dir, matching_files[-1])
 
     # Prevent parallel downloads of the same file with a lock.
-    lock_path = cache_path + '.lock'
+    lock_path = cache_path + ".lock"
     with FileLock(lock_path):
 
         if resume_download:
-            incomplete_path = cache_path + '.incomplete'
+            incomplete_path = cache_path + ".incomplete"
+
             @contextmanager
             def _resumable_file_manager():
-                with open(incomplete_path,'a+b') as f:
+                with open(incomplete_path, "a+b") as f:
                     yield f
+
             temp_file_manager = _resumable_file_manager
             if os.path.exists(incomplete_path):
                 resume_size = os.stat(incomplete_path).st_size
@@ -366,7 +394,9 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag
             # Download to temporary file, then copy to cache dir once finished.
             # Otherwise you get corrupt cache entries if the download gets interrupted.
             with temp_file_manager() as temp_file:
-                logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
+                logger.info(
+                    "%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name
+                )
 
                 # GET file object
                 if url.startswith("s3://"):
@@ -383,12 +413,12 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag
                 os.rename(temp_file.name, cache_path)
 
                 logger.info("creating metadata file for %s", cache_path)
-                meta = {'url': url, 'etag': etag}
-                meta_path = cache_path + '.json'
-                with open(meta_path, 'w') as meta_file:
+                meta = {"url": url, "etag": etag}
+                meta_path = cache_path + ".json"
+                with open(meta_path, "w") as meta_file:
                     output_string = json.dumps(meta)
                     if sys.version_info[0] == 2 and isinstance(output_string, str):
-                        output_string = unicode(output_string, 'utf-8')  # The beauty of python 2
+                        output_string = unicode(output_string, "utf-8")  # The beauty of python 2
                     meta_file.write(output_string)
 
     return cache_path
diff --git a/transformers/hf_api.py b/transformers/hf_api.py
index 170732339a..81cc9f7ebb 100644
--- a/transformers/hf_api.py
+++ b/transformers/hf_api.py
@@ -24,13 +24,14 @@ from tqdm import tqdm
 
 ENDPOINT = "https://huggingface.co"
 
+
 class S3Obj:
     def __init__(
         self,
-        filename,     # type: str
-        LastModified, # type: str
-        ETag,         # type: str
-        Size,         # type: int
+        filename,  # type: str
+        LastModified,  # type: str
+        ETag,  # type: str
+        Size,  # type: int
         **kwargs
     ):
         self.filename = filename
@@ -43,13 +44,13 @@ class PresignedUrl:
     def __init__(
         self,
         write,  # type: str
-        access, # type: str
-        type,   # type: str
+        access,  # type: str
+        type,  # type: str
         **kwargs
     ):
         self.write = write
         self.access = access
-        self.type = type # mime-type to send to S3.
+        self.type = type  # mime-type to send to S3.
 
 
 class HfApi:
@@ -58,8 +59,8 @@ class HfApi:
 
     def login(
         self,
-        username, # type: str
-        password, # type: str
+        username,  # type: str
+        password,  # type: str
     ):
         # type: (...) -> str
         """
@@ -78,8 +79,7 @@ class HfApi:
         return d["token"]
 
     def whoami(
-        self,
-        token, # type: str
+        self, token,  # type: str
     ):
         # type: (...) -> str
         """
@@ -106,11 +106,7 @@ class HfApi:
         Call HF API to get a presigned url to upload `filename` to S3.
         """
         path = "{}/api/presign".format(self.endpoint)
-        r = requests.post(
-            path,
-            headers={"authorization": "Bearer {}".format(token)},
-            json={"filename": filename},
-        )
+        r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename},)
         r.raise_for_status()
         d = r.json()
         return PresignedUrl(**d)
@@ -126,16 +122,14 @@ class HfApi:
         urls = self.presign(token, filename=filename)
         # streaming upload:
         # https://2.python-requests.org/en/master/user/advanced/#streaming-uploads
-        # 
+        #
         # Even though we presign with the correct content-type,
         # the client still has to specify it when uploading the file.
         with open(filepath, "rb") as f:
             pf = TqdmProgressFileReader(f)
             data = f if pf.total_size > 0 else ""
 
-            r = requests.put(urls.write, data=data, headers={
-                "content-type": urls.type,
-            })
+            r = requests.put(urls.write, data=data, headers={"content-type": urls.type,})
             r.raise_for_status()
             pf.close()
         return urls.access
@@ -152,7 +146,6 @@ class HfApi:
         return [S3Obj(**x) for x in d]
 
 
-
 class TqdmProgressFileReader:
     """
     Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`)
@@ -161,12 +154,12 @@ class TqdmProgressFileReader:
     see github.com/huggingface/transformers/pull/2078#discussion_r354739608
     for implementation details.
     """
+
     def __init__(
-        self,
-        f   # type: io.BufferedReader
+        self, f  # type: io.BufferedReader
     ):
         self.f = f
-        self.total_size = os.fstat(f.fileno()).st_size # type: int
+        self.total_size = os.fstat(f.fileno()).st_size  # type: int
         self.pbar = tqdm(total=self.total_size, leave=False)
         if six.PY3:
             # does not work unless PY3
@@ -182,7 +175,6 @@ class TqdmProgressFileReader:
         self.pbar.close()
 
 
-
 class HfFolder:
     path_token = expanduser("~/.huggingface/token")
 
@@ -201,7 +193,7 @@ class HfFolder:
                 if e.errno != os.errno.EEXIST:
                     raise e
                 pass
-        with open(cls.path_token, 'w+') as f:
+        with open(cls.path_token, "w+") as f:
             f.write(token)
 
     @classmethod
@@ -210,7 +202,7 @@ class HfFolder:
         Get token or None if not existent.
         """
         try:
-            with open(cls.path_token, 'r') as f:
+            with open(cls.path_token, "r") as f:
                 return f.read()
         except:
             # this is too wide. When Py2 is dead use:
diff --git a/transformers/modelcard.py b/transformers/modelcard.py
index 4a879235ae..e6b1982e96 100644
--- a/transformers/modelcard.py
+++ b/transformers/modelcard.py
@@ -14,8 +14,7 @@
 # limitations under the License.
 """ Configuration base class and utilities."""
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import copy
 import json
@@ -25,8 +24,15 @@ from io import open
 
 from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
 
-from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, WEIGHTS_NAME, TF2_WEIGHTS_NAME, \
-                        cached_path, is_remote_url, hf_bucket_url
+from .file_utils import (
+    CONFIG_NAME,
+    MODEL_CARD_NAME,
+    WEIGHTS_NAME,
+    TF2_WEIGHTS_NAME,
+    cached_path,
+    is_remote_url,
+    hf_bucket_url,
+)
 
 
 logger = logging.getLogger(__name__)
@@ -48,17 +54,18 @@ class ModelCard(object):
 
         Parameters:
     """
+
     def __init__(self, **kwargs):
         # Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers)
-        self.model_details = kwargs.pop('model_details', {})
-        self.intended_use = kwargs.pop('intended_use', {})
-        self.factors = kwargs.pop('factors', {})
-        self.metrics = kwargs.pop('metrics', {})
-        self.evaluation_data = kwargs.pop('evaluation_data', {})
-        self.training_data = kwargs.pop('training_data', {})
-        self.quantitative_analyses = kwargs.pop('quantitative_analyses', {})
-        self.ethical_considerations = kwargs.pop('ethical_considerations', {})
-        self.caveats_and_recommendations = kwargs.pop('caveats_and_recommendations', {})
+        self.model_details = kwargs.pop("model_details", {})
+        self.intended_use = kwargs.pop("intended_use", {})
+        self.factors = kwargs.pop("factors", {})
+        self.metrics = kwargs.pop("metrics", {})
+        self.evaluation_data = kwargs.pop("evaluation_data", {})
+        self.training_data = kwargs.pop("training_data", {})
+        self.quantitative_analyses = kwargs.pop("quantitative_analyses", {})
+        self.ethical_considerations = kwargs.pop("ethical_considerations", {})
+        self.caveats_and_recommendations = kwargs.pop("caveats_and_recommendations", {})
 
         # Open additional attributes
         for key, value in kwargs.items():
@@ -122,10 +129,10 @@ class ModelCard(object):
             modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
 
         """
-        cache_dir = kwargs.pop('cache_dir', None)
-        proxies = kwargs.pop('proxies', None)
-        find_from_standard_name = kwargs.pop('find_from_standard_name', True)
-        return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
+        cache_dir = kwargs.pop("cache_dir", None)
+        proxies = kwargs.pop("proxies", None)
+        find_from_standard_name = kwargs.pop("find_from_standard_name", True)
+        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
 
         if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
             # For simplicity we use the same pretrained url than the configuration files
@@ -145,36 +152,43 @@ class ModelCard(object):
 
         try:
             # Load from URL or cache if already cached
-            resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=True,
-                                               proxies=proxies, resume_download=False)
+            resolved_model_card_file = cached_path(
+                model_card_file, cache_dir=cache_dir, force_download=True, proxies=proxies, resume_download=False
+            )
             if resolved_model_card_file == model_card_file:
                 logger.info("loading model card file {}".format(model_card_file))
             else:
-                logger.info("loading model card file {} from cache at {}".format(
-                    model_card_file, resolved_model_card_file))
+                logger.info(
+                    "loading model card file {} from cache at {}".format(model_card_file, resolved_model_card_file)
+                )
             # Load model card
             modelcard = cls.from_json_file(resolved_model_card_file)
 
         except EnvironmentError:
             if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
-                logger.warning("Couldn't reach server at '{}' to download model card file.".format(
-                        model_card_file))
+                logger.warning("Couldn't reach server at '{}' to download model card file.".format(model_card_file))
             else:
-                logger.warning("Model name '{}' was not found in model name list ({}). " \
-                      "We assumed '{}' was a path or url to a model card file named {} or " \
-                      "a directory containing such a file but couldn't find any such file at this path or url.".format(
+                logger.warning(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url to a model card file named {} or "
+                    "a directory containing such a file but couldn't find any such file at this path or url.".format(
                         pretrained_model_name_or_path,
-                        ', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
-                        model_card_file, MODEL_CARD_NAME))
+                        ", ".join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
+                        model_card_file,
+                        MODEL_CARD_NAME,
+                    )
+                )
             logger.warning("Creating an empty model card.")
 
             # We fall back on creating an empty model card
             modelcard = cls()
 
         except json.JSONDecodeError:
-            logger.warning("Couldn't reach server at '{}' to download model card file or "
-                           "model card file is not a valid JSON file. "
-                           "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file))
+            logger.warning(
+                "Couldn't reach server at '{}' to download model card file or "
+                "model card file is not a valid JSON file. "
+                "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file)
+            )
             logger.warning("Creating an empty model card.")
 
             # We fall back on creating an empty model card
@@ -203,7 +217,7 @@ class ModelCard(object):
     @classmethod
     def from_json_file(cls, json_file):
         """Constructs a `ModelCard` from a json file of parameters."""
-        with open(json_file, "r", encoding='utf-8') as reader:
+        with open(json_file, "r", encoding="utf-8") as reader:
             text = reader.read()
         dict_obj = json.loads(text)
         return cls(**dict_obj)
@@ -225,5 +239,5 @@ class ModelCard(object):
 
     def to_json_file(self, json_file_path):
         """ Save this instance to a json file."""
-        with open(json_file_path, "w", encoding='utf-8') as writer:
+        with open(json_file_path, "w", encoding="utf-8") as writer:
             writer.write(self.to_json_string())
diff --git a/transformers/modeling_albert.py b/transformers/modeling_albert.py
index f833b6d6bf..3d55bcd64d 100644
--- a/transformers/modeling_albert.py
+++ b/transformers/modeling_albert.py
@@ -1,4 +1,3 @@
-
 # coding=utf-8
 # Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
 #
@@ -30,14 +29,14 @@ logger = logging.getLogger(__name__)
 
 
 ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin",
-    'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin",
-    'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin",
-    'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin",
-    'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin",
-    'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin",
-    'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin",
-    'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin",
+    "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin",
+    "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin",
+    "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin",
+    "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin",
+    "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin",
+    "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin",
+    "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin",
+    "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin",
 }
 
 
@@ -48,8 +47,10 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
         raise
     tf_path = os.path.abspath(tf_checkpoint_path)
     logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
@@ -65,7 +66,7 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
 
     for name, array in zip(names, arrays):
         print(name)
-    
+
     for name, array in zip(names, arrays):
         original_name = name
 
@@ -75,10 +76,10 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
         # Renaming and simplifying
         name = name.replace("ffn_1", "ffn")
         name = name.replace("bert/", "albert/")
-        name = name.replace("attention_1", "attention")   
+        name = name.replace("attention_1", "attention")
         name = name.replace("transform/", "")
-        name = name.replace("LayerNorm_1", "full_layer_layer_norm")    
-        name = name.replace("LayerNorm", "attention/LayerNorm")   
+        name = name.replace("LayerNorm_1", "full_layer_layer_norm")
+        name = name.replace("LayerNorm", "attention/LayerNorm")
         name = name.replace("transformer/", "")
 
         # The feed forward layer had an 'intermediate' step which has been abstracted away
@@ -97,19 +98,19 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
         name = name.replace("predictions/attention", "predictions")
 
         # Naming was changed to be more explicit
-        name = name.replace("embeddings/attention", "embeddings")    
-        name = name.replace("inner_group_", "albert_layers/") 
-        name = name.replace("group_", "albert_layer_groups/")   
+        name = name.replace("embeddings/attention", "embeddings")
+        name = name.replace("inner_group_", "albert_layers/")
+        name = name.replace("group_", "albert_layer_groups/")
 
         # Classifier
         if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
             name = "classifier/" + name
 
-        # No ALBERT model currently handles the next sentence prediction task 
+        # No ALBERT model currently handles the next sentence prediction task
         if "seq_relationship" in name:
             continue
 
-        name = name.split('/')
+        name = name.split("/")
 
         # Ignore the gradients applied by the LAMB/ADAM optimizers.
         if "adam_m" in name or "adam_v" in name or "global_step" in name:
@@ -118,19 +119,19 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
 
         pointer = model
         for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
-                l = re.split(r'_(\d+)', m_name)
+            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+                l = re.split(r"_(\d+)", m_name)
             else:
                 l = [m_name]
 
-            if l[0] == 'kernel' or l[0] == 'gamma':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
-                pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'squad':
-                pointer = getattr(pointer, 'classifier')
+            if l[0] == "kernel" or l[0] == "gamma":
+                pointer = getattr(pointer, "weight")
+            elif l[0] == "output_bias" or l[0] == "beta":
+                pointer = getattr(pointer, "bias")
+            elif l[0] == "output_weights":
+                pointer = getattr(pointer, "weight")
+            elif l[0] == "squad":
+                pointer = getattr(pointer, "classifier")
             else:
                 try:
                     pointer = getattr(pointer, l[0])
@@ -141,9 +142,9 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
                 num = int(l[1])
                 pointer = pointer[num]
 
-        if m_name[-11:] == '_embeddings':
-            pointer = getattr(pointer, 'weight')
-        elif m_name == 'kernel':
+        if m_name[-11:] == "_embeddings":
+            pointer = getattr(pointer, "weight")
+        elif m_name == "kernel":
             array = np.transpose(array)
         try:
             assert pointer.shape == array.shape
@@ -160,6 +161,7 @@ class AlbertEmbeddings(BertEmbeddings):
     """
     Construct the embeddings from word, position and token_type embeddings.
     """
+
     def __init__(self, config):
         super(AlbertEmbeddings, self).__init__(config)
 
@@ -175,7 +177,7 @@ class AlbertAttention(BertSelfAttention):
 
         self.output_attentions = config.output_attentions
         self.num_attention_heads = config.num_attention_heads
-        self.hidden_size = config.hidden_size 
+        self.hidden_size = config.hidden_size
         self.attention_head_size = config.hidden_size // config.num_attention_heads
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
@@ -237,10 +239,13 @@ class AlbertAttention(BertSelfAttention):
         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
         reshaped_context_layer = context_layer.view(*new_context_layer_shape)
-        
 
         # Should find a better way to do this
-        w = self.dense.weight.t().view(self.num_attention_heads, self.attention_head_size, self.hidden_size).to(context_layer.dtype)
+        w = (
+            self.dense.weight.t()
+            .view(self.num_attention_heads, self.attention_head_size, self.hidden_size)
+            .to(context_layer.dtype)
+        )
         b = self.dense.bias.to(context_layer.dtype)
 
         projected_context_layer = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b
@@ -252,11 +257,11 @@ class AlbertAttention(BertSelfAttention):
 class AlbertLayer(nn.Module):
     def __init__(self, config):
         super(AlbertLayer, self).__init__()
-        
+
         self.config = config
         self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.attention = AlbertAttention(config)
-        self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) 
+        self.ffn = nn.Linear(config.hidden_size, config.intermediate_size)
         self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)
         self.activation = ACT2FN[config.hidden_act]
 
@@ -273,7 +278,7 @@ class AlbertLayer(nn.Module):
 class AlbertLayerGroup(nn.Module):
     def __init__(self, config):
         super(AlbertLayerGroup, self).__init__()
-        
+
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])
@@ -303,7 +308,7 @@ class AlbertLayerGroup(nn.Module):
 class AlbertTransformer(nn.Module):
     def __init__(self, config):
         super(AlbertTransformer, self).__init__()
-        
+
         self.config = config
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
@@ -327,8 +332,12 @@ class AlbertTransformer(nn.Module):
 
             # Index of the layer inside the group
             layer_idx = int(i - group_idx * layers_per_group)
-            
-            layer_group_output = self.albert_layer_groups[group_idx](hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group])  
+
+            layer_group_output = self.albert_layer_groups[group_idx](
+                hidden_states,
+                attention_mask,
+                head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
+            )
             hidden_states = layer_group_output[0]
 
             if self.output_attentions:
@@ -337,7 +346,6 @@ class AlbertTransformer(nn.Module):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
-        
         outputs = (hidden_states,)
         if self.output_hidden_states:
             outputs = outputs + (all_hidden_states,)
@@ -346,11 +354,11 @@ class AlbertTransformer(nn.Module):
         return outputs  # last-layer hidden state, (all hidden states), (all attentions)
 
 
-
 class AlbertPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = AlbertConfig
     pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "albert"
@@ -431,8 +439,12 @@ ALBERT_INPUTS_DOCSTRING = r"""
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
 
-@add_start_docstrings("The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
-                      ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
+    ALBERT_START_DOCSTRING,
+    ALBERT_INPUTS_DOCSTRING,
+)
 class AlbertModel(AlbertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -500,8 +512,15 @@ class AlbertModel(AlbertPreTrainedModel):
             inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
             self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
-                inputs_embeds=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+    ):
 
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -520,31 +539,37 @@ class AlbertModel(AlbertPreTrainedModel):
             token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
 
         extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
         if head_mask is not None:
             if head_mask.dim() == 1:
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                 head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+                head_mask = (
+                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
+                )  # We can specify head_mask for each layer
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to fload if need + fp16 compatibility
         else:
             head_mask = [None] * self.config.num_hidden_layers
 
-        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                                           inputs_embeds=inputs_embeds)
-        encoder_outputs = self.encoder(embedding_output,
-                                       extended_attention_mask,
-                                       head_mask=head_mask)
+        embedding_output = self.embeddings(
+            input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
+        )
+        encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
 
         sequence_output = encoder_outputs[0]
 
         pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))
 
-        outputs = (sequence_output, pooled_output) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+        outputs = (sequence_output, pooled_output) + encoder_outputs[
+            1:
+        ]  # add hidden_states and attentions if they are here
         return outputs
 
+
 class AlbertMLMHead(nn.Module):
     def __init__(self, config):
         super(AlbertMLMHead, self).__init__()
@@ -566,7 +591,9 @@ class AlbertMLMHead(nn.Module):
         return prediction_scores
 
 
-@add_start_docstrings("Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    "Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING
+)
 class AlbertForMaskedLM(AlbertPreTrainedModel):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -602,21 +629,28 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
         """ Make sure we are sharing the input and output embeddings.
             Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        self._tie_or_clone_weights(self.predictions.decoder,
-                                   self.albert.embeddings.word_embeddings)
+        self._tie_or_clone_weights(self.predictions.decoder, self.albert.embeddings.word_embeddings)
 
     def get_output_embeddings(self):
         return self.predictions.decoder
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
-                masked_lm_labels=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        masked_lm_labels=None,
+    ):
         outputs = self.albert(
             input_ids=input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
             position_ids=position_ids,
             head_mask=head_mask,
-            inputs_embeds=inputs_embeds
+            inputs_embeds=inputs_embeds,
         )
         sequence_outputs = outputs[0]
 
@@ -631,9 +665,12 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
         return outputs
 
 
-@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings(
+    """Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
     the pooled output) e.g. for GLUE tasks. """,
-    ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
+    ALBERT_START_DOCSTRING,
+    ALBERT_INPUTS_DOCSTRING,
+)
 class AlbertForSequenceClassification(AlbertPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -665,6 +702,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
         loss, logits = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(AlbertForSequenceClassification, self).__init__(config)
         self.num_labels = config.num_labels
@@ -675,8 +713,16 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
-                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
 
         outputs = self.albert(
             input_ids=input_ids,
@@ -684,7 +730,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
             token_type_ids=token_type_ids,
             position_ids=position_ids,
             head_mask=head_mask,
-            inputs_embeds=inputs_embeds
+            inputs_embeds=inputs_embeds,
         )
 
         pooled_output = outputs[1]
@@ -707,10 +753,12 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
         return outputs  # (loss), logits, (hidden_states), (attentions)
 
 
-
-@add_start_docstrings("""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
     the hidden-states output to compute `span start logits` and `span end logits`). """,
-    ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
+    ALBERT_START_DOCSTRING,
+    ALBERT_INPUTS_DOCSTRING,
+)
 class AlbertForQuestionAnswering(AlbertPreTrainedModel):
     r"""
         **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -752,6 +800,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
 
 
     """
+
     def __init__(self, config):
         super(AlbertForQuestionAnswering, self).__init__(config)
         self.num_labels = config.num_labels
@@ -761,8 +810,17 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
-                inputs_embeds=None, start_positions=None, end_positions=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        start_positions=None,
+        end_positions=None,
+    ):
 
         outputs = self.albert(
             input_ids=input_ids,
@@ -770,7 +828,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
             token_type_ids=token_type_ids,
             position_ids=position_ids,
             head_mask=head_mask,
-            inputs_embeds=inputs_embeds
+            inputs_embeds=inputs_embeds,
         )
 
         sequence_output = outputs[0]
diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py
index 6b49efd378..31e9ee6bd2 100644
--- a/transformers/modeling_auto.py
+++ b/transformers/modeling_auto.py
@@ -18,31 +18,87 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import logging
 
-from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRLConfig,
-                                 DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig,
-                                 TransfoXLConfig, XLMConfig, XLNetConfig, XLMRobertaConfig)
+from .configuration_auto import (
+    AlbertConfig,
+    BertConfig,
+    CamembertConfig,
+    CTRLConfig,
+    DistilBertConfig,
+    GPT2Config,
+    OpenAIGPTConfig,
+    RobertaConfig,
+    TransfoXLConfig,
+    XLMConfig,
+    XLNetConfig,
+    XLMRobertaConfig,
+)
 
-from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, \
-    BertForTokenClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_bert import (
+    BertModel,
+    BertForMaskedLM,
+    BertForSequenceClassification,
+    BertForQuestionAnswering,
+    BertForTokenClassification,
+    BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
 from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_ctrl import CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, \
-    XLNetForTokenClassification, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering, \
-    XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, \
-    RobertaForTokenClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, \
-    DistilBertForSequenceClassification, DistilBertForTokenClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, \
-    CamembertForMultipleChoice, CamembertForTokenClassification, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, \
-    AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_xlnet import (
+    XLNetModel,
+    XLNetLMHeadModel,
+    XLNetForSequenceClassification,
+    XLNetForQuestionAnswering,
+    XLNetForTokenClassification,
+    XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
+from .modeling_xlm import (
+    XLMModel,
+    XLMWithLMHeadModel,
+    XLMForSequenceClassification,
+    XLMForQuestionAnswering,
+    XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
+from .modeling_roberta import (
+    RobertaModel,
+    RobertaForMaskedLM,
+    RobertaForSequenceClassification,
+    RobertaForTokenClassification,
+    ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
+from .modeling_distilbert import (
+    DistilBertModel,
+    DistilBertForQuestionAnswering,
+    DistilBertForMaskedLM,
+    DistilBertForSequenceClassification,
+    DistilBertForTokenClassification,
+    DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
+from .modeling_camembert import (
+    CamembertModel,
+    CamembertForMaskedLM,
+    CamembertForSequenceClassification,
+    CamembertForMultipleChoice,
+    CamembertForTokenClassification,
+    CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
+from .modeling_albert import (
+    AlbertModel,
+    AlbertForMaskedLM,
+    AlbertForSequenceClassification,
+    AlbertForQuestionAnswering,
+    ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
 from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, \
-    XLMRobertaForMultipleChoice, XLMRobertaForTokenClassification, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_xlm_roberta import (
+    XLMRobertaModel,
+    XLMRobertaForMaskedLM,
+    XLMRobertaForSequenceClassification,
+    XLMRobertaForMultipleChoice,
+    XLMRobertaForTokenClassification,
+    XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
 
 from .modeling_utils import PreTrainedModel, SequenceSummary
 
@@ -51,7 +107,8 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 
 
-ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
+ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict(
+    (key, value)
     for pretrained_map in [
         BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
@@ -66,8 +123,9 @@ ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
         CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         T5_PRETRAINED_MODEL_ARCHIVE_MAP,
         XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-        ]
-    for key, value, in pretrained_map.items())
+    ]
+    for key, value, in pretrained_map.items()
+)
 
 
 class AutoModel(object):
@@ -98,10 +156,13 @@ class AutoModel(object):
 
         This class cannot be instantiated using `__init__()` (throws an error).
     """
+
     def __init__(self):
-        raise EnvironmentError("AutoModel is designed to be instantiated "
+        raise EnvironmentError(
+            "AutoModel is designed to be instantiated "
             "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` or "
-            "`AutoModel.from_config(config)` methods.")
+            "`AutoModel.from_config(config)` methods."
+        )
 
     @classmethod
     def from_config(cls, config):
@@ -232,35 +293,39 @@ class AutoModel(object):
             model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if 't5' in pretrained_model_name_or_path:
+        if "t5" in pretrained_model_name_or_path:
             return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'distilbert' in pretrained_model_name_or_path:
+        elif "distilbert" in pretrained_model_name_or_path:
             return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'albert' in pretrained_model_name_or_path:
+        elif "albert" in pretrained_model_name_or_path:
             return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'camembert' in pretrained_model_name_or_path:
+        elif "camembert" in pretrained_model_name_or_path:
             return CamembertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlm-roberta' in pretrained_model_name_or_path:
+        elif "xlm-roberta" in pretrained_model_name_or_path:
             return XLMRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
+        elif "roberta" in pretrained_model_name_or_path:
             return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'bert' in pretrained_model_name_or_path:
+        elif "bert" in pretrained_model_name_or_path:
             return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'openai-gpt' in pretrained_model_name_or_path:
+        elif "openai-gpt" in pretrained_model_name_or_path:
             return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'gpt2' in pretrained_model_name_or_path:
+        elif "gpt2" in pretrained_model_name_or_path:
             return GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'transfo-xl' in pretrained_model_name_or_path:
+        elif "transfo-xl" in pretrained_model_name_or_path:
             return TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlnet' in pretrained_model_name_or_path:
+        elif "xlnet" in pretrained_model_name_or_path:
             return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlm' in pretrained_model_name_or_path:
+        elif "xlm" in pretrained_model_name_or_path:
             return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'ctrl' in pretrained_model_name_or_path:
+        elif "ctrl" in pretrained_model_name_or_path:
             return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm-roberta', 'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
+        raise ValueError(
+            "Unrecognized model identifier in {}. Should contains one of "
+            "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+            "'xlm-roberta', 'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(
+                pretrained_model_name_or_path
+            )
+        )
 
 
 class AutoModelWithLMHead(object):
@@ -291,10 +356,13 @@ class AutoModelWithLMHead(object):
 
         This class cannot be instantiated using `__init__()` (throws an error).
     """
+
     def __init__(self):
-        raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
+        raise EnvironmentError(
+            "AutoModelWithLMHead is designed to be instantiated "
             "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or "
-            "`AutoModelWithLMHead.from_config(config)` methods.")
+            "`AutoModelWithLMHead.from_config(config)` methods."
+        )
 
     @classmethod
     def from_config(cls, config):
@@ -423,35 +491,39 @@ class AutoModelWithLMHead(object):
             model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if 't5' in pretrained_model_name_or_path:
+        if "t5" in pretrained_model_name_or_path:
             return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'distilbert' in pretrained_model_name_or_path:
+        elif "distilbert" in pretrained_model_name_or_path:
             return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'albert' in pretrained_model_name_or_path:
+        elif "albert" in pretrained_model_name_or_path:
             return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'camembert' in pretrained_model_name_or_path:
+        elif "camembert" in pretrained_model_name_or_path:
             return CamembertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlm-roberta' in pretrained_model_name_or_path:
+        elif "xlm-roberta" in pretrained_model_name_or_path:
             return XLMRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
+        elif "roberta" in pretrained_model_name_or_path:
             return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'bert' in pretrained_model_name_or_path:
+        elif "bert" in pretrained_model_name_or_path:
             return BertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'openai-gpt' in pretrained_model_name_or_path:
+        elif "openai-gpt" in pretrained_model_name_or_path:
             return OpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'gpt2' in pretrained_model_name_or_path:
+        elif "gpt2" in pretrained_model_name_or_path:
             return GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'transfo-xl' in pretrained_model_name_or_path:
+        elif "transfo-xl" in pretrained_model_name_or_path:
             return TransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlnet' in pretrained_model_name_or_path:
+        elif "xlnet" in pretrained_model_name_or_path:
             return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlm' in pretrained_model_name_or_path:
+        elif "xlm" in pretrained_model_name_or_path:
             return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'ctrl' in pretrained_model_name_or_path:
+        elif "ctrl" in pretrained_model_name_or_path:
             return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm-roberta', 'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
+        raise ValueError(
+            "Unrecognized model identifier in {}. Should contains one of "
+            "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+            "'xlm-roberta', 'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(
+                pretrained_model_name_or_path
+            )
+        )
 
 
 class AutoModelForSequenceClassification(object):
@@ -477,10 +549,13 @@ class AutoModelForSequenceClassification(object):
 
         This class cannot be instantiated using `__init__()` (throws an error).
     """
+
     def __init__(self):
-        raise EnvironmentError("AutoModelForSequenceClassification is designed to be instantiated "
+        raise EnvironmentError(
+            "AutoModelForSequenceClassification is designed to be instantiated "
             "using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or "
-            "`AutoModelForSequenceClassification.from_config(config)` methods.")
+            "`AutoModelForSequenceClassification.from_config(config)` methods."
+        )
 
     @classmethod
     def from_config(cls, config):
@@ -597,25 +672,39 @@ class AutoModelForSequenceClassification(object):
             model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
-            return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'albert' in pretrained_model_name_or_path:
-            return AlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'camembert' in pretrained_model_name_or_path:
-            return CamembertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlm-roberta' in pretrained_model_name_or_path:
-            return XLMRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
-            return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'bert' in pretrained_model_name_or_path:
+        if "distilbert" in pretrained_model_name_or_path:
+            return DistilBertForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "albert" in pretrained_model_name_or_path:
+            return AlbertForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "camembert" in pretrained_model_name_or_path:
+            return CamembertForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "xlm-roberta" in pretrained_model_name_or_path:
+            return XLMRobertaForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "roberta" in pretrained_model_name_or_path:
+            return RobertaForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "bert" in pretrained_model_name_or_path:
             return BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlnet' in pretrained_model_name_or_path:
+        elif "xlnet" in pretrained_model_name_or_path:
             return XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlm' in pretrained_model_name_or_path:
+        elif "xlm" in pretrained_model_name_or_path:
             return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'xlnet', 'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
+        raise ValueError(
+            "Unrecognized model identifier in {}. Should contains one of "
+            "'bert', 'xlnet', 'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(
+                pretrained_model_name_or_path
+            )
+        )
 
 
 class AutoModelForQuestionAnswering(object):
@@ -638,10 +727,13 @@ class AutoModelForQuestionAnswering(object):
 
         This class cannot be instantiated using `__init__()` (throws an error).
     """
+
     def __init__(self):
-        raise EnvironmentError("AutoModelForQuestionAnswering is designed to be instantiated "
+        raise EnvironmentError(
+            "AutoModelForQuestionAnswering is designed to be instantiated "
             "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or "
-            "`AutoModelForQuestionAnswering.from_config(config)` methods.")
+            "`AutoModelForQuestionAnswering.from_config(config)` methods."
+        )
 
     @classmethod
     def from_config(cls, config):
@@ -745,26 +837,30 @@ class AutoModelForQuestionAnswering(object):
             model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if "distilbert" in pretrained_model_name_or_path:
             return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'albert' in pretrained_model_name_or_path:
+        elif "albert" in pretrained_model_name_or_path:
             return AlbertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'bert' in pretrained_model_name_or_path:
+        elif "bert" in pretrained_model_name_or_path:
             return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlnet' in pretrained_model_name_or_path:
+        elif "xlnet" in pretrained_model_name_or_path:
             return XLNetForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlm' in pretrained_model_name_or_path:
+        elif "xlm" in pretrained_model_name_or_path:
             return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path))
+        raise ValueError(
+            "Unrecognized model identifier in {}. Should contains one of "
+            "'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path)
+        )
 
 
 class AutoModelForTokenClassification:
     def __init__(self):
-        raise EnvironmentError("AutoModelForTokenClassification is designed to be instantiated "
-                               "using the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or "
-                               "`AutoModelForTokenClassification.from_config(config)` methods.")
+        raise EnvironmentError(
+            "AutoModelForTokenClassification is designed to be instantiated "
+            "using the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or "
+            "`AutoModelForTokenClassification.from_config(config)` methods."
+        )
 
     @classmethod
     def from_config(cls, config):
@@ -797,7 +893,7 @@ class AutoModelForTokenClassification:
         elif isinstance(config, XLMRobertaConfig):
             return XLMRobertaForTokenClassification(config)
         raise ValueError("Unrecognized configuration class {}".format(config))
-    
+
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         r""" Instantiates one of the question answering model classes of the library
@@ -870,18 +966,28 @@ class AutoModelForTokenClassification:
             model = AutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if 'camembert' in pretrained_model_name_or_path:
-            return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'distilbert' in pretrained_model_name_or_path:
-            return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlm-roberta' in pretrained_model_name_or_path:
-            return XLMRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
+        if "camembert" in pretrained_model_name_or_path:
+            return CamembertForTokenClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "distilbert" in pretrained_model_name_or_path:
+            return DistilBertForTokenClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "xlm-roberta" in pretrained_model_name_or_path:
+            return XLMRobertaForTokenClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "roberta" in pretrained_model_name_or_path:
             return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'bert' in pretrained_model_name_or_path:
+        elif "bert" in pretrained_model_name_or_path:
             return BertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlnet' in pretrained_model_name_or_path:
+        elif "xlnet" in pretrained_model_name_or_path:
             return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format(pretrained_model_name_or_path))
+        raise ValueError(
+            "Unrecognized model identifier in {}. Should contains one of "
+            "'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format(
+                pretrained_model_name_or_path
+            )
+        )
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index ca07a81aea..0994e832de 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -33,27 +33,27 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 
 BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin",
-    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin",
-    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin",
-    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin",
-    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin",
-    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin",
-    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin",
-    'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin",
-    'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin",
-    'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin",
-    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
-    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
-    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
-    'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
-    'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
-    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
-    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
-    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
-    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
-    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin",
-    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin",
+    "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin",
+    "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin",
+    "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin",
+    "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin",
+    "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin",
+    "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin",
+    "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin",
+    "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin",
+    "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin",
+    "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin",
+    "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
+    "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
+    "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
+    "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
+    "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
+    "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
+    "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
+    "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
+    "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
+    "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin",
+    "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin",
 }
 
 
@@ -65,8 +65,10 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
         raise
     tf_path = os.path.abspath(tf_checkpoint_path)
     logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
@@ -81,7 +83,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
         arrays.append(array)
 
     for name, array in zip(names, arrays):
-        name = name.split('/')
+        name = name.split("/")
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
         if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
@@ -89,18 +91,18 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
             continue
         pointer = model
         for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
-                l = re.split(r'_(\d+)', m_name)
+            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+                l = re.split(r"_(\d+)", m_name)
             else:
                 l = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
-                pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'squad':
-                pointer = getattr(pointer, 'classifier')
+            if l[0] == "kernel" or l[0] == "gamma":
+                pointer = getattr(pointer, "weight")
+            elif l[0] == "output_bias" or l[0] == "beta":
+                pointer = getattr(pointer, "bias")
+            elif l[0] == "output_weights":
+                pointer = getattr(pointer, "weight")
+            elif l[0] == "squad":
+                pointer = getattr(pointer, "classifier")
             else:
                 try:
                     pointer = getattr(pointer, l[0])
@@ -110,9 +112,9 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
             if len(l) >= 2:
                 num = int(l[1])
                 pointer = pointer[num]
-        if m_name[-11:] == '_embeddings':
-            pointer = getattr(pointer, 'weight')
-        elif m_name == 'kernel':
+        if m_name[-11:] == "_embeddings":
+            pointer = getattr(pointer, "weight")
+        elif m_name == "kernel":
             array = np.transpose(array)
         try:
             assert pointer.shape == array.shape
@@ -157,6 +159,7 @@ BertLayerNorm = torch.nn.LayerNorm
 class BertEmbeddings(nn.Module):
     """Construct the embeddings from word, position and token_type embeddings.
     """
+
     def __init__(self, config):
         super(BertEmbeddings, self).__init__()
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
@@ -199,7 +202,8 @@ class BertSelfAttention(nn.Module):
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
+                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+            )
         self.output_attentions = config.output_attentions
 
         self.num_attention_heads = config.num_attention_heads
@@ -217,7 +221,14 @@ class BertSelfAttention(nn.Module):
         x = x.view(*new_x_shape)
         return x.permute(0, 2, 1, 3)
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+    ):
         mixed_query_layer = self.query(hidden_states)
 
         # If this is instantiated as a cross-attention module, the keys
@@ -307,8 +318,17 @@ class BertAttention(nn.Module):
         self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
         self.pruned_heads = self.pruned_heads.union(heads)
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
-        self_outputs = self.self(hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask)
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+    ):
+        self_outputs = self.self(
+            hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask
+        )
         attention_output = self.output(self_outputs[0], hidden_states)
         outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
         return outputs
@@ -353,13 +373,22 @@ class BertLayer(nn.Module):
         self.intermediate = BertIntermediate(config)
         self.output = BertOutput(config)
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+    ):
         self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
         attention_output = self_attention_outputs[0]
         outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
 
         if self.is_decoder and encoder_hidden_states is not None:
-            cross_attention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask)
+            cross_attention_outputs = self.crossattention(
+                attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask
+            )
             attention_output = cross_attention_outputs[0]
             outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights
 
@@ -376,14 +405,23 @@ class BertEncoder(nn.Module):
         self.output_hidden_states = config.output_hidden_states
         self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+    ):
         all_hidden_states = ()
         all_attentions = ()
         for i, layer_module in enumerate(self.layer):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
-            layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask)
+            layer_outputs = layer_module(
+                hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask
+            )
             hidden_states = layer_outputs[0]
 
             if self.output_attentions:
@@ -440,9 +478,7 @@ class BertLMPredictionHead(nn.Module):
 
         # The output weights are the same as the input embeddings, but there is
         # an output-only bias for each token.
-        self.decoder = nn.Linear(config.hidden_size,
-                                 config.vocab_size,
-                                 bias=False)
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
         self.bias = nn.Parameter(torch.zeros(config.vocab_size))
 
@@ -488,6 +524,7 @@ class BertPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = BertConfig
     pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_bert
@@ -581,8 +618,12 @@ BERT_INPUTS_DOCSTRING = r"""
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
 """
 
-@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
-                      BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
+    BERT_START_DOCSTRING,
+    BERT_INPUTS_DOCSTRING,
+)
 class BertModel(BertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -612,6 +653,7 @@ class BertModel(BertPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config):
         super(BertModel, self).__init__(config)
         self.config = config
@@ -636,8 +678,17 @@ class BertModel(BertPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None,
-                head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+    ):
         """ Forward pass on the Model.
 
         The model can behave as an encoder (with only self-attention) as well
@@ -681,12 +732,18 @@ class BertModel(BertPreTrainedModel):
                 batch_size, seq_length = input_shape
                 seq_ids = torch.arange(seq_length, device=device)
                 causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
-                causal_mask = causal_mask.to(torch.long)  # not converting to long will cause errors with pytorch version < 1.3
+                causal_mask = causal_mask.to(
+                    torch.long
+                )  # not converting to long will cause errors with pytorch version < 1.3
                 extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
             else:
                 extended_attention_mask = attention_mask[:, None, None, :]
         else:
-            raise ValueError("Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(input_shape, attention_mask.shape))
+            raise ValueError(
+                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
+                    input_shape, attention_mask.shape
+                )
+            )
 
         # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
         # masked positions, this operation will create a tensor which is 0.0 for
@@ -709,10 +766,15 @@ class BertModel(BertPreTrainedModel):
             elif encoder_attention_mask.dim() == 2:
                 encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
             else:
-                raise ValueError("Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(encoder_hidden_shape,
-                                                                                                                               encoder_attention_mask.shape))
+                raise ValueError(
+                    "Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(
+                        encoder_hidden_shape, encoder_attention_mask.shape
+                    )
+                )
 
-            encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            encoder_extended_attention_mask = encoder_extended_attention_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # fp16 compatibility
             encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
         else:
             encoder_extended_attention_mask = None
@@ -727,28 +789,40 @@ class BertModel(BertPreTrainedModel):
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                 head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to fload if need + fp16 compatibility
+                head_mask = (
+                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
+                )  # We can specify head_mask for each layer
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to fload if need + fp16 compatibility
         else:
             head_mask = [None] * self.config.num_hidden_layers
 
-        embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds)
-        encoder_outputs = self.encoder(embedding_output,
-                                       attention_mask=extended_attention_mask,
-                                       head_mask=head_mask,
-                                       encoder_hidden_states=encoder_hidden_states,
-                                       encoder_attention_mask=encoder_extended_attention_mask)
+        embedding_output = self.embeddings(
+            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+        )
         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output)
 
-        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+        outputs = (sequence_output, pooled_output,) + encoder_outputs[
+            1:
+        ]  # add hidden_states and attentions if they are here
         return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training:
+@add_start_docstrings(
+    """Bert Model with two heads on top as done during the pre-training:
                        a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
-                      BERT_START_DOCSTRING,
-                      BERT_INPUTS_DOCSTRING)
+    BERT_START_DOCSTRING,
+    BERT_INPUTS_DOCSTRING,
+)
 class BertForPreTraining(BertPreTrainedModel):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -786,6 +860,7 @@ class BertForPreTraining(BertPreTrainedModel):
         prediction_scores, seq_relationship_scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(BertForPreTraining, self).__init__(config)
 
@@ -797,20 +872,33 @@ class BertForPreTraining(BertPreTrainedModel):
     def get_output_embeddings(self):
         return self.cls.predictions.decoder
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
-                masked_lm_labels=None, next_sentence_label=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        masked_lm_labels=None,
+        next_sentence_label=None,
+    ):
 
-        outputs = self.bert(input_ids,
-                            attention_mask=attention_mask,
-                            token_type_ids=token_type_ids,
-                            position_ids=position_ids,
-                            head_mask=head_mask,
-                            inputs_embeds=inputs_embeds)
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         sequence_output, pooled_output = outputs[:2]
         prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
 
-        outputs = (prediction_scores, seq_relationship_score,) + outputs[2:]  # add hidden states and attention if they are here
+        outputs = (prediction_scores, seq_relationship_score,) + outputs[
+            2:
+        ]  # add hidden states and attention if they are here
 
         if masked_lm_labels is not None and next_sentence_label is not None:
             loss_fct = CrossEntropyLoss()
@@ -822,9 +910,9 @@ class BertForPreTraining(BertPreTrainedModel):
         return outputs  # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model with a `language modeling` head on top. """,
-                      BERT_START_DOCSTRING,
-                      BERT_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING
+)
 class BertForMaskedLM(BertPreTrainedModel):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -862,6 +950,7 @@ class BertForMaskedLM(BertPreTrainedModel):
         loss, prediction_scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(BertForMaskedLM, self).__init__(config)
 
@@ -873,17 +962,30 @@ class BertForMaskedLM(BertPreTrainedModel):
     def get_output_embeddings(self):
         return self.cls.predictions.decoder
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
-                masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        masked_lm_labels=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        lm_labels=None,
+    ):
 
-        outputs = self.bert(input_ids,
-                            attention_mask=attention_mask,
-                            token_type_ids=token_type_ids,
-                            position_ids=position_ids,
-                            head_mask=head_mask,
-                            inputs_embeds=inputs_embeds,
-                            encoder_hidden_states=encoder_hidden_states,
-                            encoder_attention_mask=encoder_attention_mask)
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+        )
 
         sequence_output = outputs[0]
         prediction_scores = self.cls(sequence_output)
@@ -912,9 +1014,11 @@ class BertForMaskedLM(BertPreTrainedModel):
         return outputs  # (masked_lm_loss), (ltr_lm_loss), prediction_scores, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,
-                      BERT_START_DOCSTRING,
-                      BERT_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """Bert Model with a `next sentence prediction (classification)` head on top. """,
+    BERT_START_DOCSTRING,
+    BERT_INPUTS_DOCSTRING,
+)
 class BertForNextSentencePrediction(BertPreTrainedModel):
     r"""
         **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -945,6 +1049,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         seq_relationship_scores = outputs[0]
 
     """
+
     def __init__(self, config):
         super(BertForNextSentencePrediction, self).__init__(config)
 
@@ -953,15 +1058,25 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
-                next_sentence_label=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        next_sentence_label=None,
+    ):
 
-        outputs = self.bert(input_ids,
-                            attention_mask=attention_mask,
-                            token_type_ids=token_type_ids,
-                            position_ids=position_ids,
-                            head_mask=head_mask,
-                            inputs_embeds=inputs_embeds)
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         pooled_output = outputs[1]
 
@@ -976,10 +1091,12 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         return outputs  # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings(
+    """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
                       the pooled output) e.g. for GLUE tasks. """,
-                      BERT_START_DOCSTRING,
-                      BERT_INPUTS_DOCSTRING)
+    BERT_START_DOCSTRING,
+    BERT_INPUTS_DOCSTRING,
+)
 class BertForSequenceClassification(BertPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -1011,6 +1128,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
         loss, logits = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(BertForSequenceClassification, self).__init__(config)
         self.num_labels = config.num_labels
@@ -1021,15 +1139,25 @@ class BertForSequenceClassification(BertPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
-                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
 
-        outputs = self.bert(input_ids,
-                            attention_mask=attention_mask,
-                            token_type_ids=token_type_ids,
-                            position_ids=position_ids,
-                            head_mask=head_mask,
-                            inputs_embeds=inputs_embeds)
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         pooled_output = outputs[1]
 
@@ -1051,10 +1179,12 @@ class BertForSequenceClassification(BertPreTrainedModel):
         return outputs  # (loss), logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """Bert Model with a multiple choice classification head on top (a linear layer on top of
                       the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-                      BERT_START_DOCSTRING,
-                      BERT_INPUTS_DOCSTRING)
+    BERT_START_DOCSTRING,
+    BERT_INPUTS_DOCSTRING,
+)
 class BertForMultipleChoice(BertPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -1087,6 +1217,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
         loss, classification_scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(BertForMultipleChoice, self).__init__(config)
 
@@ -1096,8 +1227,16 @@ class BertForMultipleChoice(BertPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
-                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
         num_choices = input_ids.shape[1]
 
         input_ids = input_ids.view(-1, input_ids.size(-1))
@@ -1105,12 +1244,14 @@ class BertForMultipleChoice(BertPreTrainedModel):
         token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
         position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
 
-        outputs = self.bert(input_ids,
-                            attention_mask=attention_mask,
-                            token_type_ids=token_type_ids,
-                            position_ids=position_ids,
-                            head_mask=head_mask,
-                            inputs_embeds=inputs_embeds)
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         pooled_output = outputs[1]
 
@@ -1128,10 +1269,12 @@ class BertForMultipleChoice(BertPreTrainedModel):
         return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """Bert Model with a token classification head on top (a linear layer on top of
                       the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-                      BERT_START_DOCSTRING,
-                      BERT_INPUTS_DOCSTRING)
+    BERT_START_DOCSTRING,
+    BERT_INPUTS_DOCSTRING,
+)
 class BertForTokenClassification(BertPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -1161,6 +1304,7 @@ class BertForTokenClassification(BertPreTrainedModel):
         loss, scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(BertForTokenClassification, self).__init__(config)
         self.num_labels = config.num_labels
@@ -1171,15 +1315,25 @@ class BertForTokenClassification(BertPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
-                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
 
-        outputs = self.bert(input_ids,
-                            attention_mask=attention_mask,
-                            token_type_ids=token_type_ids,
-                            position_ids=position_ids,
-                            head_mask=head_mask,
-                            inputs_embeds=inputs_embeds)
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         sequence_output = outputs[0]
 
@@ -1202,10 +1356,12 @@ class BertForTokenClassification(BertPreTrainedModel):
         return outputs  # (loss), scores, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
                       the hidden-states output to compute `span start logits` and `span end logits`). """,
-                      BERT_START_DOCSTRING,
-                      BERT_INPUTS_DOCSTRING)
+    BERT_START_DOCSTRING,
+    BERT_INPUTS_DOCSTRING,
+)
 class BertForQuestionAnswering(BertPreTrainedModel):
     r"""
         **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -1247,6 +1403,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
 
 
     """
+
     def __init__(self, config):
         super(BertForQuestionAnswering, self).__init__(config)
         self.num_labels = config.num_labels
@@ -1256,15 +1413,26 @@ class BertForQuestionAnswering(BertPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
-                start_positions=None, end_positions=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        start_positions=None,
+        end_positions=None,
+    ):
 
-        outputs = self.bert(input_ids,
-                            attention_mask=attention_mask,
-                            token_type_ids=token_type_ids,
-                            position_ids=position_ids,
-                            head_mask=head_mask,
-                            inputs_embeds=inputs_embeds)
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         sequence_output = outputs[0]
 
diff --git a/transformers/modeling_camembert.py b/transformers/modeling_camembert.py
index 1b808bfd82..2a7a7a7332 100644
--- a/transformers/modeling_camembert.py
+++ b/transformers/modeling_camembert.py
@@ -15,19 +15,24 @@
 # limitations under the License.
 """PyTorch CamemBERT model. """
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 
-from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification
+from .modeling_roberta import (
+    RobertaModel,
+    RobertaForMaskedLM,
+    RobertaForSequenceClassification,
+    RobertaForMultipleChoice,
+    RobertaForTokenClassification,
+)
 from .configuration_camembert import CamembertConfig
 from .file_utils import add_start_docstrings
 
 logger = logging.getLogger(__name__)
 
 CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-pytorch_model.bin",
+    "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-pytorch_model.bin",
 }
 
 
@@ -100,8 +105,12 @@ CAMEMBERT_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
-                      CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
+    CAMEMBERT_START_DOCSTRING,
+    CAMEMBERT_INPUTS_DOCSTRING,
+)
 class CamembertModel(RobertaModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -149,8 +158,11 @@ class CamembertModel(RobertaModel):
     pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
-@add_start_docstrings("""CamemBERT Model with a `language modeling` head on top. """,
-    CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """CamemBERT Model with a `language modeling` head on top. """,
+    CAMEMBERT_START_DOCSTRING,
+    CAMEMBERT_INPUTS_DOCSTRING,
+)
 class CamembertForMaskedLM(RobertaForMaskedLM):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -185,9 +197,12 @@ class CamembertForMaskedLM(RobertaForMaskedLM):
     pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
-@add_start_docstrings("""CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer 
+@add_start_docstrings(
+    """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer 
     on top of the pooled output) e.g. for GLUE tasks. """,
-    CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
+    CAMEMBERT_START_DOCSTRING,
+    CAMEMBERT_INPUTS_DOCSTRING,
+)
 class CamembertForSequenceClassification(RobertaForSequenceClassification):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -223,9 +238,12 @@ class CamembertForSequenceClassification(RobertaForSequenceClassification):
     pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
-@add_start_docstrings("""CamemBERT Model with a multiple choice classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """CamemBERT Model with a multiple choice classification head on top (a linear layer on top of
     the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-    CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
+    CAMEMBERT_START_DOCSTRING,
+    CAMEMBERT_INPUTS_DOCSTRING,
+)
 class CamembertForMultipleChoice(RobertaForMultipleChoice):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -257,9 +275,12 @@ class CamembertForMultipleChoice(RobertaForMultipleChoice):
     pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
-@add_start_docstrings("""CamemBERT Model with a token classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """CamemBERT Model with a token classification head on top (a linear layer on top of
     the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
+    CAMEMBERT_START_DOCSTRING,
+    CAMEMBERT_INPUTS_DOCSTRING,
+)
 class CamembertForTokenClassification(RobertaForTokenClassification):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
diff --git a/transformers/modeling_ctrl.py b/transformers/modeling_ctrl.py
index fabb79efd8..37c15cf54f 100644
--- a/transformers/modeling_ctrl.py
+++ b/transformers/modeling_ctrl.py
@@ -40,14 +40,17 @@ CTRL_PRETRAINED_MODEL_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-
 
 
 def angle_defn(pos, i, d_model_size):
-    angle_rates = 1 / torch.pow(10000, (2 * (i//2)) / d_model_size)
+    angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
     return pos * angle_rates
 
+
 def positional_encoding(position, d_model_size, dtype):
     # create the sinusoidal pattern for the positional encoding
-    angle_rads = (angle_defn(torch.arange(position, dtype=dtype).unsqueeze(1),
-                  torch.arange(d_model_size, dtype=dtype).unsqueeze(0),
-                  d_model_size))
+    angle_rads = angle_defn(
+        torch.arange(position, dtype=dtype).unsqueeze(1),
+        torch.arange(d_model_size, dtype=dtype).unsqueeze(0),
+        d_model_size,
+    )
 
     sines = torch.sin(angle_rads[:, 0::2])
     cosines = torch.cos(angle_rads[:, 1::2])
@@ -55,22 +58,23 @@ def positional_encoding(position, d_model_size, dtype):
     pos_encoding = torch.cat([sines, cosines], dim=-1)
     return pos_encoding
 
+
 def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
     # calculate attention
-    matmul_qk = torch.matmul(q, k.permute(0,1,3,2))
+    matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2))
 
     dk = k.shape[-1]
     scaled_attention_logits = matmul_qk / np.sqrt(dk)
 
     if mask is not None:
         nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1)
-        scaled_attention_logits += (mask[ns-nd:ns, :ns] * -1e4)
+        scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4
 
     if attention_mask is not None:
         # Apply the attention mask
         scaled_attention_logits = scaled_attention_logits + attention_mask
 
-    attention_weights = torch.softmax(scaled_attention_logits, dim=-1) 
+    attention_weights = torch.softmax(scaled_attention_logits, dim=-1)
 
     # Mask heads if we want to
     if head_mask is not None:
@@ -128,11 +132,8 @@ class MultiHeadAttention(torch.nn.Module):
         return outputs
 
 
-
 def point_wise_feed_forward_network(d_model_size, dff):
-    return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff),
-                               torch.nn.ReLU(),
-                               torch.nn.Linear(dff, d_model_size))
+    return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff), torch.nn.ReLU(), torch.nn.Linear(dff, d_model_size))
 
 
 class EncoderLayer(torch.nn.Module):
@@ -150,10 +151,9 @@ class EncoderLayer(torch.nn.Module):
 
     def forward(self, x, mask, layer_past=None, attention_mask=None, head_mask=None):
         normed = self.layernorm1(x)
-        attn_outputs = self.multi_head_attention(normed, normed, normed, mask,
-                                                      layer_past=layer_past,
-                                                      attention_mask=attention_mask,
-                                                      head_mask=head_mask)
+        attn_outputs = self.multi_head_attention(
+            normed, normed, normed, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask
+        )
         attn_output = attn_outputs[0]
         attn_output = self.dropout1(attn_output)
         out1 = x + attn_output
@@ -171,6 +171,7 @@ class CTRLPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = CTRLConfig
     pretrained_model_archive_map = CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "transformer"
@@ -244,8 +245,12 @@ CTRL_INPUTS_DOCSTRING = r"""    Inputs:
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
-                                            CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
+    CTRL_START_DOCSTRING,
+    CTRL_INPUTS_DOCSTRING,
+)
 class CTRLModel(CTRLPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -273,6 +278,7 @@ class CTRLModel(CTRLPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config):
         super(CTRLModel, self).__init__(config)
         self.output_hidden_states = config.output_hidden_states
@@ -287,11 +293,12 @@ class CTRLModel(CTRLPreTrainedModel):
         self.w = nn.Embedding(config.vocab_size, config.n_embd)
 
         self.dropout = nn.Dropout(config.embd_pdrop)
-        self.h = nn.ModuleList([EncoderLayer(config.n_embd,
-                                             config.n_head,
-                                             config.dff,
-                                             config.resid_pdrop,
-                                             config.output_attentions) for _ in range(config.n_layer)])
+        self.h = nn.ModuleList(
+            [
+                EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop, config.output_attentions)
+                for _ in range(config.n_layer)
+            ]
+        )
         self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
 
         self.init_weights()
@@ -309,7 +316,16 @@ class CTRLModel(CTRLPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.h[layer].attn.prune_heads(heads)
 
-    def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
+    def forward(
+        self,
+        input_ids=None,
+        past=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+    ):
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
@@ -345,7 +361,7 @@ class CTRLModel(CTRLPreTrainedModel):
             # positions we want to attend and -10000.0 for masked positions.
             # Since we are adding it to the raw scores before the softmax, this is
             # effectively the same as removing these entirely.
-            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
+            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
             attention_mask = (1.0 - attention_mask) * -10000.0
 
         # Prepare head mask if needed
@@ -357,8 +373,12 @@ class CTRLModel(CTRLPreTrainedModel):
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                 head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+                head_mask = (
+                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
+                )  # We can specify head_mask for each layer
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to fload if need + fp16 compatibility
         else:
             head_mask = [None] * self.config.n_layer
 
@@ -391,11 +411,9 @@ class CTRLModel(CTRLPreTrainedModel):
         for i, (h, layer_past) in enumerate(zip(self.h, past)):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
-            outputs = h(hidden_states,
-                        mask,
-                        layer_past=layer_past,
-                        attention_mask=attention_mask,
-                        head_mask=head_mask[i])
+            outputs = h(
+                hidden_states, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i]
+            )
             hidden_states, present = outputs[:2]
             if self.output_past:
                 presents = presents + (present,)
@@ -421,8 +439,12 @@ class CTRLModel(CTRLPreTrainedModel):
         return outputs
 
 
-@add_start_docstrings("""The CTRL Model transformer with a language modeling head on top
-(linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """The CTRL Model transformer with a language modeling head on top
+(linear layer with weights tied to the input embeddings). """,
+    CTRL_START_DOCSTRING,
+    CTRL_INPUTS_DOCSTRING,
+)
 class CTRLLMHeadModel(CTRLPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -463,6 +485,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
         loss, logits = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(CTRLLMHeadModel, self).__init__(config)
         self.transformer = CTRLModel(config)
@@ -473,15 +496,26 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_head
 
-    def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
-                labels=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               past=past,
-                                               attention_mask=attention_mask,
-                                               token_type_ids=token_type_ids,
-                                               position_ids=position_ids,
-                                               head_mask=head_mask,
-                                               inputs_embeds=inputs_embeds)
+    def forward(
+        self,
+        input_ids=None,
+        past=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
+        transformer_outputs = self.transformer(
+            input_ids,
+            past=past,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         hidden_states = transformer_outputs[0]
 
@@ -495,8 +529,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
             shift_labels = labels[..., 1:].contiguous()
             # Flatten the tokens
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-                            shift_labels.view(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
             outputs = (loss,) + outputs
 
         return outputs  # (loss), lm_logits, presents, (all hidden_states), (attentions)
diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py
index 7098529c9e..7345c23651 100644
--- a/transformers/modeling_distilbert.py
+++ b/transformers/modeling_distilbert.py
@@ -37,14 +37,15 @@ from .configuration_distilbert import DistilBertConfig
 from .file_utils import add_start_docstrings
 
 import logging
+
 logger = logging.getLogger(__name__)
 
 
 DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
-    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin",
-    'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin",
-    'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin",
+    "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
+    "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin",
+    "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin",
+    "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin",
 }
 
 
@@ -52,26 +53,24 @@ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
 def gelu(x):
     return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
 
+
 def create_sinusoidal_embeddings(n_pos, dim, out):
-    position_enc = np.array([
-        [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
-        for pos in range(n_pos)
-    ])
+    position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
     out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
     out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
     out.detach_()
     out.requires_grad = False
 
+
 class Embeddings(nn.Module):
-    def __init__(self,
-                 config):
+    def __init__(self, config):
         super(Embeddings, self).__init__()
         self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=0)
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
         if config.sinusoidal_pos_embds:
-            create_sinusoidal_embeddings(n_pos=config.max_position_embeddings,
-                                         dim=config.dim,
-                                         out=self.position_embeddings.weight)
+            create_sinusoidal_embeddings(
+                n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight
+            )
 
         self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12)
         self.dropout = nn.Dropout(config.dropout)
@@ -89,17 +88,18 @@ class Embeddings(nn.Module):
             The embedded tokens (plus position embeddings, no token_type embeddings)
         """
         seq_length = input_ids.size(1)
-        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length)
-        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)                      # (bs, max_seq_length)
+        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)  # (max_seq_length)
+        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)  # (bs, max_seq_length)
 
-        word_embeddings = self.word_embeddings(input_ids)                   # (bs, max_seq_length, dim)
-        position_embeddings = self.position_embeddings(position_ids)        # (bs, max_seq_length, dim)
+        word_embeddings = self.word_embeddings(input_ids)  # (bs, max_seq_length, dim)
+        position_embeddings = self.position_embeddings(position_ids)  # (bs, max_seq_length, dim)
 
         embeddings = word_embeddings + position_embeddings  # (bs, max_seq_length, dim)
-        embeddings = self.LayerNorm(embeddings)             # (bs, max_seq_length, dim)
-        embeddings = self.dropout(embeddings)               # (bs, max_seq_length, dim)
+        embeddings = self.LayerNorm(embeddings)  # (bs, max_seq_length, dim)
+        embeddings = self.dropout(embeddings)  # (bs, max_seq_length, dim)
         return embeddings
 
+
 class MultiHeadSelfAttention(nn.Module):
     def __init__(self, config):
         super(MultiHeadSelfAttention, self).__init__()
@@ -139,7 +139,7 @@ class MultiHeadSelfAttention(nn.Module):
         self.dim = attention_head_size * self.n_heads
         self.pruned_heads = self.pruned_heads.union(heads)
 
-    def forward(self, query, key, value, mask, head_mask = None):
+    def forward(self, query, key, value, mask, head_mask=None):
         """
         Parameters
         ----------
@@ -172,39 +172,42 @@ class MultiHeadSelfAttention(nn.Module):
             """ group heads """
             return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
 
-        q = shape(self.q_lin(query))           # (bs, n_heads, q_length, dim_per_head)
-        k = shape(self.k_lin(key))             # (bs, n_heads, k_length, dim_per_head)
-        v = shape(self.v_lin(value))           # (bs, n_heads, k_length, dim_per_head)
+        q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
+        k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
+        v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)
 
-        q = q / math.sqrt(dim_per_head)                     # (bs, n_heads, q_length, dim_per_head)
-        scores = torch.matmul(q, k.transpose(2,3))          # (bs, n_heads, q_length, k_length)
-        mask = (mask==0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length)
-        scores.masked_fill_(mask, -float('inf'))            # (bs, n_heads, q_length, k_length)
+        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
+        scores = torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, q_length, k_length)
+        mask = (mask == 0).view(mask_reshp).expand_as(scores)  # (bs, n_heads, q_length, k_length)
+        scores.masked_fill_(mask, -float("inf"))  # (bs, n_heads, q_length, k_length)
 
-        weights = nn.Softmax(dim=-1)(scores)   # (bs, n_heads, q_length, k_length)
-        weights = self.dropout(weights)        # (bs, n_heads, q_length, k_length)
+        weights = nn.Softmax(dim=-1)(scores)  # (bs, n_heads, q_length, k_length)
+        weights = self.dropout(weights)  # (bs, n_heads, q_length, k_length)
 
         # Mask heads if we want to
         if head_mask is not None:
             weights = weights * head_mask
 
-        context = torch.matmul(weights, v)     # (bs, n_heads, q_length, dim_per_head)
-        context = unshape(context)             # (bs, q_length, dim)
-        context = self.out_lin(context)        # (bs, q_length, dim)
+        context = torch.matmul(weights, v)  # (bs, n_heads, q_length, dim_per_head)
+        context = unshape(context)  # (bs, q_length, dim)
+        context = self.out_lin(context)  # (bs, q_length, dim)
 
         if self.output_attentions:
             return (context, weights)
         else:
             return (context,)
 
+
 class FFN(nn.Module):
     def __init__(self, config):
         super(FFN, self).__init__()
         self.dropout = nn.Dropout(p=config.dropout)
         self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
         self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
-        assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
-        self.activation = gelu if config.activation == 'gelu' else nn.ReLU()
+        assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format(
+            config.activation
+        )
+        self.activation = gelu if config.activation == "gelu" else nn.ReLU()
 
     def forward(self, input):
         x = self.lin1(input)
@@ -213,6 +216,7 @@ class FFN(nn.Module):
         x = self.dropout(x)
         return x
 
+
 class TransformerBlock(nn.Module):
     def __init__(self, config):
         super(TransformerBlock, self).__init__()
@@ -249,14 +253,14 @@ class TransformerBlock(nn.Module):
         # Self-Attention
         sa_output = self.attention(query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask)
         if self.output_attentions:
-            sa_output, sa_weights = sa_output                  # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
-        else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples
+            sa_output, sa_weights = sa_output  # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
+        else:  # To handle these `output_attention` or `output_hidden_states` cases returning tuples
             assert type(sa_output) == tuple
             sa_output = sa_output[0]
-        sa_output = self.sa_layer_norm(sa_output + x)          # (bs, seq_length, dim)
+        sa_output = self.sa_layer_norm(sa_output + x)  # (bs, seq_length, dim)
 
         # Feed Forward Network
-        ffn_output = self.ffn(sa_output)                             # (bs, seq_length, dim)
+        ffn_output = self.ffn(sa_output)  # (bs, seq_length, dim)
         ffn_output = self.output_layer_norm(ffn_output + sa_output)  # (bs, seq_length, dim)
 
         output = (ffn_output,)
@@ -303,9 +307,7 @@ class Transformer(nn.Module):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_state,)
 
-            layer_outputs = layer_module(x=hidden_state,
-                                         attn_mask=attn_mask,
-                                         head_mask=head_mask[i])
+            layer_outputs = layer_module(x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i])
             hidden_state = layer_outputs[-1]
 
             if self.output_attentions:
@@ -332,6 +334,7 @@ class DistilBertPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for downloading and loading pretrained models.
     """
+
     config_class = DistilBertConfig
     pretrained_model_archive_map = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = None
@@ -396,8 +399,12 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
-                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
+    DISTILBERT_START_DOCSTRING,
+    DISTILBERT_INPUTS_DOCSTRING,
+)
 class DistilBertModel(DistilBertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -420,11 +427,12 @@ class DistilBertModel(DistilBertPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config):
         super(DistilBertModel, self).__init__(config)
 
-        self.embeddings = Embeddings(config)   # Embeddings
-        self.transformer = Transformer(config) # Encoder
+        self.embeddings = Embeddings(config)  # Embeddings
+        self.transformer = Transformer(config)  # Encoder
 
         self.init_weights()
 
@@ -442,8 +450,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.transformer.layer[layer].attention.prune_heads(heads)
 
-    def forward(self,
-                input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None):
+    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None):
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
@@ -456,7 +463,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
         device = input_ids.device if input_ids is not None else inputs_embeds.device
 
         if attention_mask is None:
-            attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length)
+            attention_mask = torch.ones(input_shape, device=device)  # (bs, seq_length)
 
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
@@ -468,24 +475,29 @@ class DistilBertModel(DistilBertPreTrainedModel):
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                 head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+                head_mask = (
+                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
+                )  # We can specify head_mask for each layer
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to fload if need + fp16 compatibility
         else:
             head_mask = [None] * self.config.num_hidden_layers
 
         if inputs_embeds is None:
-            inputs_embeds = self.embeddings(input_ids)   # (bs, seq_length, dim)
-        tfmr_output = self.transformer(x=inputs_embeds,
-                                       attn_mask=attention_mask,
-                                       head_mask=head_mask)
+            inputs_embeds = self.embeddings(input_ids)  # (bs, seq_length, dim)
+        tfmr_output = self.transformer(x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask)
         hidden_state = tfmr_output[0]
-        output = (hidden_state, ) + tfmr_output[1:]
+        output = (hidden_state,) + tfmr_output[1:]
 
-        return output # last-layer hidden-state, (all hidden_states), (all attentions)
+        return output  # last-layer hidden-state, (all hidden_states), (all attentions)
 
 
-@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """,
-                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """DistilBert Model with a `masked language modeling` head on top. """,
+    DISTILBERT_START_DOCSTRING,
+    DISTILBERT_INPUTS_DOCSTRING,
+)
 class DistilBertForMaskedLM(DistilBertPreTrainedModel):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -516,6 +528,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
         loss, prediction_scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(DistilBertForMaskedLM, self).__init__(config)
         self.output_attentions = config.output_attentions
@@ -534,28 +547,31 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
         return self.vocab_projector
 
     def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None):
-        dlbrt_output = self.distilbert(input_ids=input_ids,
-                                       attention_mask=attention_mask,
-                                       head_mask=head_mask,
-                                       inputs_embeds=inputs_embeds)
-        hidden_states = dlbrt_output[0]                              # (bs, seq_length, dim)
-        prediction_logits = self.vocab_transform(hidden_states)      # (bs, seq_length, dim)
-        prediction_logits = gelu(prediction_logits)                  # (bs, seq_length, dim)
-        prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim)
+        dlbrt_output = self.distilbert(
+            input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
+        )
+        hidden_states = dlbrt_output[0]  # (bs, seq_length, dim)
+        prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
+        prediction_logits = gelu(prediction_logits)  # (bs, seq_length, dim)
+        prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
         prediction_logits = self.vocab_projector(prediction_logits)  # (bs, seq_length, vocab_size)
 
-        outputs = (prediction_logits, ) + dlbrt_output[1:]
+        outputs = (prediction_logits,) + dlbrt_output[1:]
         if masked_lm_labels is not None:
-            mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)),
-                                         masked_lm_labels.view(-1))
-            outputs = (mlm_loss,) + outputs     
+            mlm_loss = self.mlm_loss_fct(
+                prediction_logits.view(-1, prediction_logits.size(-1)), masked_lm_labels.view(-1)
+            )
+            outputs = (mlm_loss,) + outputs
 
-        return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions)
+        return outputs  # (mlm_loss), prediction_logits, (all hidden_states), (all attentions)
 
 
-@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings(
+    """DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
                          the pooled output) e.g. for GLUE tasks. """,
-                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+    DISTILBERT_START_DOCSTRING,
+    DISTILBERT_INPUTS_DOCSTRING,
+)
 class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -587,6 +603,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
         loss, logits = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(DistilBertForSequenceClassification, self).__init__(config)
         self.num_labels = config.num_labels
@@ -599,16 +616,15 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
         self.init_weights()
 
     def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None):
-        distilbert_output = self.distilbert(input_ids=input_ids,
-                                            attention_mask=attention_mask,
-                                            head_mask=head_mask,
-                                            inputs_embeds=inputs_embeds)
-        hidden_state = distilbert_output[0]                    # (bs, seq_len, dim)
-        pooled_output = hidden_state[:, 0]                    # (bs, dim)
-        pooled_output = self.pre_classifier(pooled_output)   # (bs, dim)
-        pooled_output = nn.ReLU()(pooled_output)             # (bs, dim)
-        pooled_output = self.dropout(pooled_output)         # (bs, dim)
-        logits = self.classifier(pooled_output)              # (bs, dim)
+        distilbert_output = self.distilbert(
+            input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
+        )
+        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
+        pooled_output = hidden_state[:, 0]  # (bs, dim)
+        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
+        pooled_output = nn.ReLU()(pooled_output)  # (bs, dim)
+        pooled_output = self.dropout(pooled_output)  # (bs, dim)
+        logits = self.classifier(pooled_output)  # (bs, dim)
 
         outputs = (logits,) + distilbert_output[1:]
         if labels is not None:
@@ -623,9 +639,12 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
         return outputs  # (loss), logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
                          the hidden-states output to compute `span start logits` and `span end logits`). """,
-                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+    DISTILBERT_START_DOCSTRING,
+    DISTILBERT_INPUTS_DOCSTRING,
+)
 class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
     r"""
         **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -663,6 +682,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
         loss, start_scores, end_scores = outputs[:3]
 
     """
+
     def __init__(self, config):
         super(DistilBertForQuestionAnswering, self).__init__(config)
 
@@ -672,19 +692,26 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
         self.dropout = nn.Dropout(config.qa_dropout)
 
         self.init_weights()
-        
-    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None):
-        distilbert_output = self.distilbert(input_ids=input_ids,
-                                            attention_mask=attention_mask,
-                                            head_mask=head_mask,
-                                            inputs_embeds=inputs_embeds)
-        hidden_states = distilbert_output[0]                                 # (bs, max_query_len, dim)
 
-        hidden_states = self.dropout(hidden_states)                       # (bs, max_query_len, dim)
-        logits = self.qa_outputs(hidden_states)                           # (bs, max_query_len, 2)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        start_positions=None,
+        end_positions=None,
+    ):
+        distilbert_output = self.distilbert(
+            input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
+        )
+        hidden_states = distilbert_output[0]  # (bs, max_query_len, dim)
+
+        hidden_states = self.dropout(hidden_states)  # (bs, max_query_len, dim)
+        logits = self.qa_outputs(hidden_states)  # (bs, max_query_len, 2)
         start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1)                           # (bs, max_query_len)
-        end_logits = end_logits.squeeze(-1)                               # (bs, max_query_len)
+        start_logits = start_logits.squeeze(-1)  # (bs, max_query_len)
+        end_logits = end_logits.squeeze(-1)  # (bs, max_query_len)
 
         outputs = (start_logits, end_logits,) + distilbert_output[1:]
         if start_positions is not None and end_positions is not None:
@@ -707,10 +734,12 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
         return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """DistilBert Model with a token classification head on top (a linear layer on top of
                       the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-                      DISTILBERT_START_DOCSTRING,
-                      DISTILBERT_INPUTS_DOCSTRING)
+    DISTILBERT_START_DOCSTRING,
+    DISTILBERT_INPUTS_DOCSTRING,
+)
 class DistilBertForTokenClassification(DistilBertPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -740,6 +769,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
         loss, scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(DistilBertForTokenClassification, self).__init__(config)
         self.num_labels = config.num_labels
@@ -750,13 +780,11 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, head_mask=None,
-                inputs_embeds=None, labels=None):
+    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None):
 
-        outputs = self.distilbert(input_ids,
-                            attention_mask=attention_mask,
-                            head_mask=head_mask,
-                            inputs_embeds=inputs_embeds)
+        outputs = self.distilbert(
+            input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
+        )
 
         sequence_output = outputs[0]
 
diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py
index ddfebdc393..e5bad422c4 100644
--- a/transformers/modeling_encoder_decoder.py
+++ b/transformers/modeling_encoder_decoder.py
@@ -145,16 +145,12 @@ class PreTrainedEncoderDecoder(nn.Module):
         # by the value of the flag `is_decoder` that we need to set correctly.
         encoder = kwargs_encoder.pop("model", None)
         if encoder is None:
-            encoder = AutoModel.from_pretrained(
-                encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder
-            )
+            encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
         encoder.config.is_decoder = False
 
         decoder = kwargs_decoder.pop("model", None)
         if decoder is None:
-            decoder = AutoModelWithLMHead.from_pretrained(
-                decoder_pretrained_model_name_or_path, **kwargs_decoder
-            )
+            decoder = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
         decoder.config.is_decoder = True
 
         model = cls(encoder, decoder)
@@ -168,18 +164,23 @@ class PreTrainedEncoderDecoder(nn.Module):
         We save the encoder' and decoder's parameters in two separate directories.
         """
 
-        # If the root output directory does not exist, create it 
+        # If the root output directory does not exist, create it
         if not os.path.exists(save_directory):
             os.mkdir(save_directory)
 
         # Check whether the output directory is empty or not
-        sub_directories = [directory for directory in os.listdir(save_directory)
-            if os.path.isdir(os.path.join(save_directory, directory))]
+        sub_directories = [
+            directory
+            for directory in os.listdir(save_directory)
+            if os.path.isdir(os.path.join(save_directory, directory))
+        ]
 
         if len(sub_directories) > 0:
             if "encoder" in sub_directories and "decoder" in sub_directories:
-                print("WARNING: there is an older version of encoder-decoder saved in" +\
-                    " the output directory. The default behaviour is to overwrite them.")
+                print(
+                    "WARNING: there is an older version of encoder-decoder saved in"
+                    + " the output directory. The default behaviour is to overwrite them."
+                )
 
             # Empty the output directory
             for directory_to_remove in sub_directories:
@@ -190,7 +191,7 @@ class PreTrainedEncoderDecoder(nn.Module):
                 # Remove the subdirectory itself
                 os.rmdir(os.path.join(save_directory, directory_to_remove))
 
-            assert(len(os.listdir(save_directory)) == 0) # sanity check
+            assert len(os.listdir(save_directory)) == 0  # sanity check
 
         # Create the "encoder" directory inside the output directory and save the encoder into it
         if not os.path.exists(os.path.join(save_directory, "encoder")):
diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py
index 3a7561ca58..fe8a973f0b 100644
--- a/transformers/modeling_gpt2.py
+++ b/transformers/modeling_gpt2.py
@@ -36,11 +36,14 @@ from .file_utils import add_start_docstrings
 
 logger = logging.getLogger(__name__)
 
-GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
-                                     "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
-                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin",
-                                     "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-pytorch_model.bin",
-                                     "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",}
+GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
+    "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
+    "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin",
+    "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-pytorch_model.bin",
+    "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",
+}
+
 
 def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
     """ Load tf checkpoints in a pytorch model
@@ -50,8 +53,10 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
         raise
     tf_path = os.path.abspath(gpt2_checkpoint_path)
     logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
@@ -67,20 +72,20 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
 
     for name, array in zip(names, arrays):
         name = name[6:]  # skip "model/"
-        name = name.split('/')
+        name = name.split("/")
         pointer = model
         for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+\d+', m_name):
-                l = re.split(r'(\d+)', m_name)
+            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
+                l = re.split(r"(\d+)", m_name)
             else:
                 l = [m_name]
-            if l[0] == 'w' or l[0] == 'g':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'b':
-                pointer = getattr(pointer, 'bias')
-            elif l[0] == 'wpe' or l[0] == 'wte':
+            if l[0] == "w" or l[0] == "g":
+                pointer = getattr(pointer, "weight")
+            elif l[0] == "b":
+                pointer = getattr(pointer, "bias")
+            elif l[0] == "wpe" or l[0] == "wte":
                 pointer = getattr(pointer, l[0])
-                pointer = getattr(pointer, 'weight')
+                pointer = getattr(pointer, "weight")
             else:
                 pointer = getattr(pointer, l[0])
             if len(l) >= 2:
@@ -130,7 +135,7 @@ class Attention(nn.Module):
             mask[head] = 0
         mask = mask.view(-1).contiguous().eq(1)
         index = torch.arange(len(mask))[mask].long()
-        index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
+        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
 
         # Prune conv1d layers
         self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
@@ -146,7 +151,7 @@ class Attention(nn.Module):
         if self.scale:
             w = w / math.sqrt(v.size(-1))
         nd, ns = w.size(-2), w.size(-1)
-        b = self.bias[:, :, ns-nd:ns, :ns]
+        b = self.bias[:, :, ns - nd : ns, :ns]
         w = w * b - 1e4 * (1 - b)
 
         if attention_mask is not None:
@@ -226,10 +231,9 @@ class Block(nn.Module):
         self.mlp = MLP(4 * nx, config)
 
     def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
-        output_attn = self.attn(self.ln_1(x),
-                                layer_past=layer_past,
-                                attention_mask=attention_mask,
-                                head_mask=head_mask)
+        output_attn = self.attn(
+            self.ln_1(x), layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask
+        )
         a = output_attn[0]  # output_attn: a, present, (attentions)
 
         x = x + a
@@ -244,6 +248,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = GPT2Config
     pretrained_model_archive_map = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_gpt2
@@ -321,8 +326,12 @@ GPT2_INPUTS_DOCSTRING = r"""    Inputs:
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
-                      GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
+    GPT2_START_DOCSTRING,
+    GPT2_INPUTS_DOCSTRING,
+)
 class GPT2Model(GPT2PreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -350,6 +359,7 @@ class GPT2Model(GPT2PreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config):
         super(GPT2Model, self).__init__(config)
         self.output_hidden_states = config.output_hidden_states
@@ -377,7 +387,16 @@ class GPT2Model(GPT2PreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.h[layer].attn.prune_heads(heads)
 
-    def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
+    def forward(
+        self,
+        input_ids=None,
+        past=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+    ):
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
@@ -418,7 +437,7 @@ class GPT2Model(GPT2PreTrainedModel):
             # positions we want to attend and -10000.0 for masked positions.
             # Since we are adding it to the raw scores before the softmax, this is
             # effectively the same as removing these entirely.
-            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
+            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
             attention_mask = (1.0 - attention_mask) * -10000.0
 
         # Prepare head mask if needed
@@ -430,8 +449,12 @@ class GPT2Model(GPT2PreTrainedModel):
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                 head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+                head_mask = (
+                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
+                )  # We can specify head_mask for each layer
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to fload if need + fp16 compatibility
         else:
             head_mask = [None] * self.config.n_layer
 
@@ -454,10 +477,9 @@ class GPT2Model(GPT2PreTrainedModel):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
 
-            outputs = block(hidden_states,
-                            layer_past=layer_past,
-                            attention_mask=attention_mask,
-                            head_mask=head_mask[i])
+            outputs = block(
+                hidden_states, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i]
+            )
 
             hidden_states, present = outputs[:2]
             if self.output_past:
@@ -486,8 +508,12 @@ class GPT2Model(GPT2PreTrainedModel):
         return outputs  # last hidden state, (presents), (all hidden_states), (attentions)
 
 
-@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
-(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """The GPT2 Model transformer with a language modeling head on top
+(linear layer with weights tied to the input embeddings). """,
+    GPT2_START_DOCSTRING,
+    GPT2_INPUTS_DOCSTRING,
+)
 class GPT2LMHeadModel(GPT2PreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -528,6 +554,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         loss, logits = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(GPT2LMHeadModel, self).__init__(config)
         self.transformer = GPT2Model(config)
@@ -538,15 +565,26 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_head
 
-    def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
-                labels=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               past=past,
-                                               attention_mask=attention_mask,
-                                               token_type_ids=token_type_ids,
-                                               position_ids=position_ids,
-                                               head_mask=head_mask,
-                                               inputs_embeds=inputs_embeds)
+    def forward(
+        self,
+        input_ids=None,
+        past=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
+        transformer_outputs = self.transformer(
+            input_ids,
+            past=past,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
         hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
@@ -558,18 +596,21 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
             shift_labels = labels[..., 1:].contiguous()
             # Flatten the tokens
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-                            shift_labels.view(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
             outputs = (loss,) + outputs
 
         return outputs  # (loss), lm_logits, presents, (all hidden_states), (attentions)
 
 
-@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
+@add_start_docstrings(
+    """The GPT2 Model transformer with a language modeling and a multiple-choice classification
 head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
 The language modeling head has its weights tied to the input embeddings,
 the classification head takes as input the input of a specified classification token index in the input sequence).
-""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
+""",
+    GPT2_START_DOCSTRING,
+    GPT2_INPUTS_DOCSTRING,
+)
 class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     r"""
         **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
@@ -632,6 +673,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(GPT2DoubleHeadsModel, self).__init__(config)
         config.num_labels = 1
@@ -644,15 +686,28 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_head
 
-    def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
-                mc_token_ids=None, lm_labels=None, mc_labels=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               past=past,
-                                               attention_mask=attention_mask,
-                                               token_type_ids=token_type_ids,
-                                               position_ids=position_ids,
-                                               head_mask=head_mask,
-                                               inputs_embeds=inputs_embeds)
+    def forward(
+        self,
+        input_ids=None,
+        past=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        mc_token_ids=None,
+        lm_labels=None,
+        mc_labels=None,
+    ):
+        transformer_outputs = self.transformer(
+            input_ids,
+            past=past,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         hidden_states = transformer_outputs[0]
 
@@ -662,15 +717,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
         if mc_labels is not None:
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
-                            mc_labels.view(-1))
+            loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
             outputs = (loss,) + outputs
         if lm_labels is not None:
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = lm_labels[..., 1:].contiguous()
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-                            shift_labels.view(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
             outputs = (loss,) + outputs
 
         return outputs  # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions)
diff --git a/transformers/modeling_mmbt.py b/transformers/modeling_mmbt.py
index 79a717ba2a..1c173ac692 100644
--- a/transformers/modeling_mmbt.py
+++ b/transformers/modeling_mmbt.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """PyTorch MMBT model. """
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 
@@ -32,6 +31,7 @@ logger = logging.getLogger(__name__)
 class ModalEmbeddings(nn.Module):
     """Generic Modal Embeddings which takes in an encoder, and a transformer embedding.
     """
+
     def __init__(self, config, encoder, embeddings):
         super(ModalEmbeddings, self).__init__()
         self.config = config
@@ -62,7 +62,9 @@ class ModalEmbeddings(nn.Module):
             position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length)
 
         if token_type_ids is None:
-            token_type_ids = torch.zeros((input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device)
+            token_type_ids = torch.zeros(
+                (input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device
+            )
 
         position_embeddings = self.position_embeddings(position_ids)
         token_type_embeddings = self.token_type_embeddings(token_type_ids)
@@ -140,8 +142,12 @@ MMBT_INPUTS_DOCSTRING = r"""    Inputs:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
 """
 
-@add_start_docstrings("The bare MMBT Model outputting raw hidden-states without any specific head on top.",
-                      MMBT_START_DOCSTRING, MMBT_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare MMBT Model outputting raw hidden-states without any specific head on top.",
+    MMBT_START_DOCSTRING,
+    MMBT_INPUTS_DOCSTRING,
+)
 class MMBTModel(nn.Module):
     r"""
         Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -167,19 +173,29 @@ class MMBTModel(nn.Module):
             encoder = ImageEncoder(args)
             mmbt = MMBTModel(config, transformer, encoder)
         """
+
     def __init__(self, config, transformer, encoder):
         super(MMBTModel, self).__init__()
         self.config = config
         self.transformer = transformer
         self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)
 
-    def forward(self, input_modal, input_ids=None, modal_start_tokens=None,
-                modal_end_tokens=None, attention_mask=None,
-                token_type_ids=None, modal_token_type_ids=None,
-                position_ids=None, modal_position_ids=None, head_mask=None,
-                inputs_embeds=None, encoder_hidden_states=None,
-                encoder_attention_mask=None):
-
+    def forward(
+        self,
+        input_modal,
+        input_ids=None,
+        modal_start_tokens=None,
+        modal_end_tokens=None,
+        attention_mask=None,
+        token_type_ids=None,
+        modal_token_type_ids=None,
+        position_ids=None,
+        modal_position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+    ):
 
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -192,21 +208,22 @@ class MMBTModel(nn.Module):
 
         device = input_ids.device if input_ids is not None else inputs_embeds.device
 
-        modal_embeddings = self.modal_encoder(input_modal,
-                                              start_token=modal_start_tokens,
-                                              end_token=modal_end_tokens,
-                                              position_ids=modal_position_ids,
-                                              token_type_ids=modal_token_type_ids)
+        modal_embeddings = self.modal_encoder(
+            input_modal,
+            start_token=modal_start_tokens,
+            end_token=modal_end_tokens,
+            position_ids=modal_position_ids,
+            token_type_ids=modal_token_type_ids,
+        )
 
         input_modal_shape = modal_embeddings.size()[:-1]
 
         if token_type_ids is None:
             token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device)
 
-        txt_embeddings = self.transformer.embeddings(input_ids=input_ids,
-                                                     position_ids=position_ids,
-                                                     token_type_ids=token_type_ids,
-                                                     inputs_embeds=inputs_embeds)
+        txt_embeddings = self.transformer.embeddings(
+            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
+        )
 
         embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1)
 
@@ -215,12 +232,16 @@ class MMBTModel(nn.Module):
         if attention_mask is None:
             attention_mask = torch.ones(input_shape, device=device)
         else:
-            attention_mask = torch.cat([torch.ones(input_modal_shape, device=device, dtype=torch.long),  attention_mask], dim=1)
+            attention_mask = torch.cat(
+                [torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1
+            )
 
         if encoder_attention_mask is None:
             encoder_attention_mask = torch.ones(input_shape, device=device)
         else:
-            encoder_attention_mask = torch.cat([torch.ones(input_modal_shape, device=device),  encoder_attention_mask], dim=1)
+            encoder_attention_mask = torch.cat(
+                [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1
+            )
 
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
@@ -254,7 +275,9 @@ class MMBTModel(nn.Module):
         if encoder_attention_mask.dim() == 2:
             encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
 
-        encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+        encoder_extended_attention_mask = encoder_extended_attention_mask.to(
+            dtype=next(self.parameters()).dtype
+        )  # fp16 compatibility
         encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
 
         # Prepare head mask if needed
@@ -267,25 +290,31 @@ class MMBTModel(nn.Module):
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                 head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to fload if need + fp16 compatibility
+                head_mask = (
+                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
+                )  # We can specify head_mask for each layer
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to fload if need + fp16 compatibility
         else:
             head_mask = [None] * self.config.num_hidden_layers
 
-
-        encoder_outputs = self.transformer.encoder(embedding_output,
-                                                   attention_mask=extended_attention_mask,
-                                                   head_mask=head_mask,
-                                                   encoder_hidden_states=encoder_hidden_states,
-                                                   encoder_attention_mask=encoder_extended_attention_mask)
+        encoder_outputs = self.transformer.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+        )
 
         sequence_output = encoder_outputs[0]
         pooled_output = self.transformer.pooler(sequence_output)
 
-        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+        outputs = (sequence_output, pooled_output,) + encoder_outputs[
+            1:
+        ]  # add hidden_states and attentions if they are here
         return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
 
-
     def get_input_embeddings(self):
         return self.embeddings.word_embeddings
 
@@ -293,8 +322,12 @@ class MMBTModel(nn.Module):
         self.embeddings.word_embeddings = value
 
 
-@add_start_docstrings("""MMBT Model with a sequence classification/regression head on top (a linear layer on top of
-                      the pooled output)""", MMBT_START_DOCSTRING, MMBT_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """MMBT Model with a sequence classification/regression head on top (a linear layer on top of
+                      the pooled output)""",
+    MMBT_START_DOCSTRING,
+    MMBT_INPUTS_DOCSTRING,
+)
 class MMBTForClassification(nn.Module):
     r"""
             **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -333,20 +366,35 @@ class MMBTForClassification(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
 
-    def forward(self, input_modal, input_ids=None, modal_start_tokens=None, modal_end_tokens=None,
-                attention_mask=None, token_type_ids=None, modal_token_type_ids=None, position_ids=None,
-                modal_position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
+    def forward(
+        self,
+        input_modal,
+        input_ids=None,
+        modal_start_tokens=None,
+        modal_end_tokens=None,
+        attention_mask=None,
+        token_type_ids=None,
+        modal_token_type_ids=None,
+        position_ids=None,
+        modal_position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
 
-        outputs = self.mmbt(input_modal=input_modal, input_ids=input_ids,
-                            modal_start_tokens=modal_start_tokens,
-                            modal_end_tokens=modal_end_tokens,
-                            attention_mask=attention_mask,
-                            token_type_ids=token_type_ids,
-                            modal_token_type_ids=modal_token_type_ids,
-                            position_ids=position_ids,
-                            modal_position_ids=modal_position_ids,
-                            head_mask=head_mask,
-                            inputs_embeds=inputs_embeds)
+        outputs = self.mmbt(
+            input_modal=input_modal,
+            input_ids=input_ids,
+            modal_start_tokens=modal_start_tokens,
+            modal_end_tokens=modal_end_tokens,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            modal_token_type_ids=modal_token_type_ids,
+            position_ids=position_ids,
+            modal_position_ids=modal_position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         pooled_output = outputs[1]
 
@@ -365,4 +413,4 @@ class MMBTForClassification(nn.Module):
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
             outputs = (loss,) + outputs
 
-        return outputs  # (loss), logits, (hidden_states), (attentions)
\ No newline at end of file
+        return outputs  # (loss), logits, (hidden_states), (attentions)
diff --git a/transformers/modeling_openai.py b/transformers/modeling_openai.py
index 2f08b4093d..ed746ecac8 100644
--- a/transformers/modeling_openai.py
+++ b/transformers/modeling_openai.py
@@ -36,7 +36,9 @@ from .file_utils import add_start_docstrings
 
 logger = logging.getLogger(__name__)
 
-OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
+OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"
+}
 
 
 def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
@@ -45,17 +47,17 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
     import re
     import numpy as np
 
-    if '.ckpt' in openai_checkpoint_folder_path:
+    if ".ckpt" in openai_checkpoint_folder_path:
         openai_checkpoint_folder_path = os.path.dirname(openai_checkpoint_folder_path)
 
     logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
 
-    with open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8') as names_handle:
+    with open(openai_checkpoint_folder_path + "/parameters_names.json", "r", encoding="utf-8") as names_handle:
         names = json.load(names_handle)
-    with open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8') as shapes_handle:
+    with open(openai_checkpoint_folder_path + "/params_shapes.json", "r", encoding="utf-8") as shapes_handle:
         shapes = json.load(shapes_handle)
     offsets = np.cumsum([np.prod(shape) for shape in shapes])
-    init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
+    init_params = [np.load(openai_checkpoint_folder_path + "/params_{}.npy".format(n)) for n in range(10)]
     init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
     init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
 
@@ -79,23 +81,23 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
     init_params.pop(0)
     init_params.pop(0)
 
-    for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]):
+    for name, array in zip(names, init_params):  # names[1:n_transfer], init_params[1:n_transfer]):
         name = name[6:]  # skip "model/"
         assert name[-2:] == ":0"
         name = name[:-2]
-        name = name.split('/')
+        name = name.split("/")
         pointer = model
         for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+\d+', m_name):
-                l = re.split(r'(\d+)', m_name)
+            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
+                l = re.split(r"(\d+)", m_name)
             else:
                 l = [m_name]
-            if l[0] == 'g':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'b':
-                pointer = getattr(pointer, 'bias')
-            elif l[0] == 'w':
-                pointer = getattr(pointer, 'weight')
+            if l[0] == "g":
+                pointer = getattr(pointer, "weight")
+            elif l[0] == "b":
+                pointer = getattr(pointer, "bias")
+            elif l[0] == "w":
+                pointer = getattr(pointer, "weight")
             else:
                 pointer = getattr(pointer, l[0])
             if len(l) >= 2:
@@ -156,7 +158,7 @@ class Attention(nn.Module):
             mask[head] = 0
         mask = mask.view(-1).contiguous().eq(1)
         index = torch.arange(len(mask))[mask].long()
-        index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
+        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
         # Prune conv1d layers
         self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
         self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
@@ -172,7 +174,7 @@ class Attention(nn.Module):
         # w = w * self.bias + -1e9 * (1 - self.bias)  # TF implem method: mask_attn_weights
         # XD: self.b may be larger than w, so we need to crop it
         b = self.bias[:, :, : w.size(-2), : w.size(-1)]
-        w = w * b + - 1e4 * (1 - b)
+        w = w * b + -1e4 * (1 - b)
 
         if attention_mask is not None:
             # Apply the attention mask
@@ -261,6 +263,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = OpenAIGPTConfig
     pretrained_model_archive_map = OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_openai_gpt
@@ -330,8 +333,12 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
-                      OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
+    OPENAI_GPT_START_DOCSTRING,
+    OPENAI_GPT_INPUTS_DOCSTRING,
+)
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -354,6 +361,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config):
         super(OpenAIGPTModel, self).__init__(config)
         self.output_attentions = config.output_attentions
@@ -379,7 +387,15 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.h[layer].attn.prune_heads(heads)
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+    ):
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
@@ -410,7 +426,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             # positions we want to attend and -10000.0 for masked positions.
             # Since we are adding it to the raw scores before the softmax, this is
             # effectively the same as removing these entirely.
-            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
+            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
             attention_mask = (1.0 - attention_mask) * -10000.0
 
         # Prepare head mask if needed
@@ -422,8 +438,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                 head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+                head_mask = (
+                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
+                )  # We can specify head_mask for each layer
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to fload if need + fp16 compatibility
         else:
             head_mask = [None] * self.config.n_layer
 
@@ -463,8 +483,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         return outputs  # last hidden state, (all hidden states), (all attentions)
 
 
-@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
-(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """OpenAI GPT Model transformer with a language modeling head on top
+(linear layer with weights tied to the input embeddings). """,
+    OPENAI_GPT_START_DOCSTRING,
+    OPENAI_GPT_INPUTS_DOCSTRING,
+)
 class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -496,6 +520,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         loss, logits = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(OpenAIGPTLMHeadModel, self).__init__(config)
         self.transformer = OpenAIGPTModel(config)
@@ -506,14 +531,24 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_head
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
-                labels=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               attention_mask=attention_mask,
-                                               token_type_ids=token_type_ids,
-                                               position_ids=position_ids,
-                                               head_mask=head_mask,
-                                               inputs_embeds=inputs_embeds)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
         hidden_states = transformer_outputs[0]
         lm_logits = self.lm_head(hidden_states)
 
@@ -524,18 +559,21 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
             shift_labels = labels[..., 1:].contiguous()
             # Flatten the tokens
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-                            shift_labels.view(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
             outputs = (loss,) + outputs
 
         return outputs  # (loss), lm_logits, (all hidden states), (all attentions)
 
 
-@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
+@add_start_docstrings(
+    """OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
 head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
 The language modeling head has its weights tied to the input embeddings,
 the classification head takes as input the input of a specified classification token index in the input sequence).
-""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
+""",
+    OPENAI_GPT_START_DOCSTRING,
+    OPENAI_GPT_INPUTS_DOCSTRING,
+)
 class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     r"""
         **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
@@ -587,6 +625,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
 
@@ -600,14 +639,26 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_head
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
-                mc_token_ids=None, lm_labels=None, mc_labels=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               attention_mask=attention_mask,
-                                               token_type_ids=token_type_ids,
-                                               position_ids=position_ids,
-                                               head_mask=head_mask,
-                                               inputs_embeds=inputs_embeds)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        mc_token_ids=None,
+        lm_labels=None,
+        mc_labels=None,
+    ):
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
         hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
@@ -616,15 +667,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
         if mc_labels is not None:
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
-                            mc_labels.view(-1))
+            loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
             outputs = (loss,) + outputs
         if lm_labels is not None:
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = lm_labels[..., 1:].contiguous()
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-                            shift_labels.view(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
             outputs = (loss,) + outputs
 
         return outputs  # (lm loss), (mc loss), lm logits, mc logits, (all hidden_states), (attentions)
diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py
index 4faab46f7a..730058ea92 100644
--- a/transformers/modeling_roberta.py
+++ b/transformers/modeling_roberta.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """PyTorch RoBERTa model. """
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 
@@ -31,24 +30,27 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 
 ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
-    'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
-    'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
-    'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
-    'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin",
-    'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin",
+    "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
+    "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
+    "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
+    "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
+    "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin",
+    "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin",
 }
 
+
 class RobertaEmbeddings(BertEmbeddings):
     """
     Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
     """
+
     def __init__(self, config):
         super(RobertaEmbeddings, self).__init__(config)
         self.padding_idx = 1
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
-        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size,
-                                                padding_idx=self.padding_idx)
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
 
     def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
         if position_ids is None:
@@ -58,10 +60,9 @@ class RobertaEmbeddings(BertEmbeddings):
             else:
                 position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
 
-        return super(RobertaEmbeddings, self).forward(input_ids,
-                                                      token_type_ids=token_type_ids,
-                                                      position_ids=position_ids,
-                                                      inputs_embeds=inputs_embeds)
+        return super(RobertaEmbeddings, self).forward(
+            input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
+        )
 
     def create_position_ids_from_input_ids(self, x):
         """ Replace non-padding symbols with their position numbers. Position numbers begin at
@@ -85,8 +86,9 @@ class RobertaEmbeddings(BertEmbeddings):
         input_shape = inputs_embeds.size()[:-1]
         sequence_length = input_shape[1]
 
-        position_ids = torch.arange(self.padding_idx+1, sequence_length+self.padding_idx+1, dtype=torch.long,
-                                    device=inputs_embeds.device)
+        position_ids = torch.arange(
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
         return position_ids.unsqueeze(0).expand(input_shape)
 
 
@@ -162,8 +164,12 @@ ROBERTA_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
-                      ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
+    ROBERTA_START_DOCSTRING,
+    ROBERTA_INPUTS_DOCSTRING,
+)
 class RobertaModel(BertModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -209,8 +215,10 @@ class RobertaModel(BertModel):
     def set_input_embeddings(self, value):
         self.embeddings.word_embeddings = value
 
-@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
-    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    """RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING
+)
 class RobertaForMaskedLM(BertPreTrainedModel):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -256,14 +264,24 @@ class RobertaForMaskedLM(BertPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_head.decoder
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
-                masked_lm_labels=None):
-        outputs = self.roberta(input_ids,
-                               attention_mask=attention_mask,
-                               token_type_ids=token_type_ids,
-                               position_ids=position_ids,
-                               head_mask=head_mask,
-                               inputs_embeds=inputs_embeds)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        masked_lm_labels=None,
+    ):
+        outputs = self.roberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
         sequence_output = outputs[0]
         prediction_scores = self.lm_head(sequence_output)
 
@@ -299,9 +317,12 @@ class RobertaLMHead(nn.Module):
         return x
 
 
-@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 
+@add_start_docstrings(
+    """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 
     on top of the pooled output) e.g. for GLUE tasks. """,
-    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+    ROBERTA_START_DOCSTRING,
+    ROBERTA_INPUTS_DOCSTRING,
+)
 class RobertaForSequenceClassification(BertPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -343,15 +364,25 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
 
         self.roberta = RobertaModel(config)
         self.classifier = RobertaClassificationHead(config)
-    
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
-                labels=None):
-        outputs = self.roberta(input_ids,
-                               attention_mask=attention_mask,
-                               token_type_ids=token_type_ids,
-                               position_ids=position_ids,
-                               head_mask=head_mask,
-                               inputs_embeds=inputs_embeds)
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
+        outputs = self.roberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
         sequence_output = outputs[0]
         logits = self.classifier(sequence_output)
 
@@ -369,9 +400,12 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
         return outputs  # (loss), logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """Roberta Model with a multiple choice classification head on top (a linear layer on top of
     the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+    ROBERTA_START_DOCSTRING,
+    ROBERTA_INPUTS_DOCSTRING,
+)
 class RobertaForMultipleChoice(BertPreTrainedModel):
     r"""
     Inputs:
@@ -455,16 +489,29 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None,
-                position_ids=None, head_mask=None, inputs_embeds=None):
+    def forward(
+        self,
+        input_ids=None,
+        token_type_ids=None,
+        attention_mask=None,
+        labels=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+    ):
         num_choices = input_ids.shape[1]
 
         flat_input_ids = input_ids.view(-1, input_ids.size(-1))
         flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
         flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
         flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
-        outputs = self.roberta(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids,
-                            attention_mask=flat_attention_mask, head_mask=head_mask)
+        outputs = self.roberta(
+            flat_input_ids,
+            position_ids=flat_position_ids,
+            token_type_ids=flat_token_type_ids,
+            attention_mask=flat_attention_mask,
+            head_mask=head_mask,
+        )
         pooled_output = outputs[1]
 
         pooled_output = self.dropout(pooled_output)
@@ -481,9 +528,12 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
         return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Roberta Model with a token classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """Roberta Model with a token classification head on top (a linear layer on top of
     the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+    ROBERTA_START_DOCSTRING,
+    ROBERTA_INPUTS_DOCSTRING,
+)
 class RobertaForTokenClassification(BertPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -527,15 +577,25 @@ class RobertaForTokenClassification(BertPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
-                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
 
-        outputs = self.roberta(input_ids,
-                               attention_mask=attention_mask,
-                               token_type_ids=token_type_ids,
-                               position_ids=position_ids,
-                               head_mask=head_mask,
-                               inputs_embeds=inputs_embeds)
+        outputs = self.roberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         sequence_output = outputs[0]
 
@@ -577,9 +637,12 @@ class RobertaClassificationHead(nn.Module):
         return x
 
 
-@add_start_docstrings("""Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
     the hidden-states output to compute `span start logits` and `span end logits`). """,
-    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+    ROBERTA_START_DOCSTRING,
+    ROBERTA_INPUTS_DOCSTRING,
+)
 class RobertaForQuestionAnswering(BertPreTrainedModel):
     r"""
         **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -626,14 +689,24 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
-                start_positions=None, end_positions=None):
+    def forward(
+        self,
+        input_ids,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        start_positions=None,
+        end_positions=None,
+    ):
 
-        outputs = self.roberta(input_ids,
-                               attention_mask=attention_mask,
-                               token_type_ids=token_type_ids,
-                               position_ids=position_ids,
-                               head_mask=head_mask)
+        outputs = self.roberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+        )
 
         sequence_output = outputs[0]
 
@@ -660,4 +733,4 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
             total_loss = (start_loss + end_loss) / 2
             outputs = (total_loss,) + outputs
 
-        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
\ No newline at end of file
+        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index 9baf69d02b..2ee8cd011b 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -41,11 +41,11 @@ logger = logging.getLogger(__name__)
 # for the pretrained weights provided with the models
 ####################################################
 T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
-    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
-    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
-    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
-    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
+    "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
+    "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
+    "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
+    "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
+    "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
 }
 
 ####################################################
@@ -60,8 +60,10 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
         raise
     tf_path = os.path.abspath(tf_checkpoint_path)
     logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
@@ -76,26 +78,26 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
         tf_weights[name] = array
 
     for txt_name in names:
-        name = txt_name.split('/')
+        name = txt_name.split("/")
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
         if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
             logger.info("Skipping {}".format("/".join(name)))
             tf_weights.pop(txt_name, None)
             continue
-        if '_slot_' in name[-1]:
+        if "_slot_" in name[-1]:
             logger.info("Skipping {}".format("/".join(name)))
             tf_weights.pop(txt_name, None)
             continue
         pointer = model
         array = tf_weights[txt_name]
         for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
-                l = re.split(r'_(\d+)', m_name)
+            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+                l = re.split(r"_(\d+)", m_name)
             else:
                 l = [m_name]
-            if l[0] in ['kernel', 'scale', 'embedding']:
-                pointer = getattr(pointer, 'weight')
+            if l[0] in ["kernel", "scale", "embedding"]:
+                pointer = getattr(pointer, "weight")
             # elif l[0] == 'scale':
             #     pointer = getattr(pointer, 'weight')
             # elif l[0] == 'output_bias' or l[0] == 'beta':
@@ -111,9 +113,9 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
             if len(l) >= 2:
                 num = int(l[1])
                 pointer = pointer[num]
-        if l[0] not in ['kernel', 'scale', 'embedding']:
-            pointer = getattr(pointer, 'weight')
-        if l[0] != 'embedding':
+        if l[0] not in ["kernel", "scale", "embedding"]:
+            pointer = getattr(pointer, "weight")
+        if l[0] != "embedding":
             logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name))
             array = np.transpose(array)
         try:
@@ -125,7 +127,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
         pointer.data = torch.from_numpy(array.astype(np.float32))
         tf_weights.pop(txt_name, None)
 
-    logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys())))
     # logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
     return model
 
@@ -136,6 +138,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
 # - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module)
 ####################################################
 
+
 class T5LayerNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """ Construct a layernorm module in the T5 style
@@ -228,10 +231,7 @@ class T5Attention(nn.Module):
         self.pruned_heads = self.pruned_heads.union(heads)
 
     @staticmethod
-    def _relative_position_bucket(relative_position,
-                                  bidirectional=True,
-                                  num_buckets=32,
-                                  max_distance=128):
+    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
         """
         Adapted from Mesh Tensorflow:
         https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
@@ -267,12 +267,12 @@ class T5Attention(nn.Module):
 
         # half of the buckets are for exact increments in positions
         max_exact = num_buckets // 2
-        is_small = (n < max_exact)
+        is_small = n < max_exact
 
         # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
         val_if_large = max_exact + (
-            torch.log(n.float() / max_exact)
-            / math.log(max_distance / max_exact) * (num_buckets - max_exact)).to(torch.long)
+            torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
+        ).to(torch.long)
         val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
 
         ret += torch.where(is_small, n, val_if_large)
@@ -283,11 +283,13 @@ class T5Attention(nn.Module):
         context_position = torch.arange(qlen, dtype=torch.long)[:, None]
         memory_position = torch.arange(klen, dtype=torch.long)[None, :]
         relative_position = memory_position - context_position  # shape (qlen, klen)
-        rp_bucket = self._relative_position_bucket(relative_position,  # shape (qlen, klen)
-                                                   bidirectional=not self.is_decoder,
-                                                   num_buckets=self.relative_attention_num_buckets)
+        rp_bucket = self._relative_position_bucket(
+            relative_position,  # shape (qlen, klen)
+            bidirectional=not self.is_decoder,
+            num_buckets=self.relative_attention_num_buckets,
+        )
         values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
-        values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen)
+        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, qlen, klen)
         return values
 
     def forward(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None):
@@ -298,7 +300,7 @@ class T5Attention(nn.Module):
         # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
         bs, qlen, dim = input.size()
         if kv is None:
-            klen = qlen if cache is None else cache['slen'] + qlen
+            klen = qlen if cache is None else cache["slen"] + qlen
         else:
             klen = kv.size(1)
 
@@ -310,45 +312,45 @@ class T5Attention(nn.Module):
             """  compute context """
             return x.transpose(1, 2).contiguous().view(bs, -1, self.inner_dim)
 
-        q = shape(self.q(input))                                          # (bs, n_heads, qlen, dim_per_head)
+        q = shape(self.q(input))  # (bs, n_heads, qlen, dim_per_head)
         if kv is None:
-            k = shape(self.k(input))                                      # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v(input))                                      # (bs, n_heads, qlen, dim_per_head)
+            k = shape(self.k(input))  # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v(input))  # (bs, n_heads, qlen, dim_per_head)
         elif cache is None or self.layer_id not in cache:
             k = v = kv
-            k = shape(self.k(k))                                          # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v(v))                                          # (bs, n_heads, qlen, dim_per_head)
+            k = shape(self.k(k))  # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v(v))  # (bs, n_heads, qlen, dim_per_head)
 
         if cache is not None:
             if self.layer_id in cache:
                 if kv is None:
                     k_, v_ = cache[self.layer_id]
-                    k = torch.cat([k_, k], dim=2)                             # (bs, n_heads, klen, dim_per_head)
-                    v = torch.cat([v_, v], dim=2)                             # (bs, n_heads, klen, dim_per_head)
+                    k = torch.cat([k_, k], dim=2)  # (bs, n_heads, klen, dim_per_head)
+                    v = torch.cat([v_, v], dim=2)  # (bs, n_heads, klen, dim_per_head)
                 else:
                     k, v = cache[self.layer_id]
             cache[self.layer_id] = (k, v)
 
         # q = q / math.sqrt(dim_per_head)                                     # No scaling in T5
-        scores = torch.einsum('bnqd,bnkd->bnqk', q, k)                        # (bs, n_heads, qlen, klen)
+        scores = torch.einsum("bnqd,bnkd->bnqk", q, k)  # (bs, n_heads, qlen, klen)
 
         if position_bias is None:
             if not self.has_relative_attention_bias:
                 raise ValueError("No position_bias provided and no weights to compute position_bias")
             position_bias = self.compute_bias(qlen, klen)
             if mask is not None:
-                position_bias = position_bias + mask                          # (bs, n_heads, qlen, klen)
+                position_bias = position_bias + mask  # (bs, n_heads, qlen, klen)
 
         scores += position_bias
-        weights = F.softmax(scores.float(), dim=-1).type_as(scores)           # (bs, n_heads, qlen, klen)
+        weights = F.softmax(scores.float(), dim=-1).type_as(scores)  # (bs, n_heads, qlen, klen)
         weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
 
         # Mask heads if we want to
         if head_mask is not None:
             weights = weights * head_mask
 
-        context = torch.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
-        context = unshape(context)                                            # (bs, qlen, dim)
+        context = torch.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)  # (bs, qlen, dim)
 
         context = self.o(context)
 
@@ -369,10 +371,9 @@ class T5LayerSelfAttention(nn.Module):
 
     def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None):
         norm_x = self.layer_norm(hidden_states)
-        attention_output = self.SelfAttention(norm_x,
-                                              mask=attention_mask,
-                                              position_bias=position_bias,
-                                              head_mask=head_mask)
+        attention_output = self.SelfAttention(
+            norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask
+        )
         y = attention_output[0]
         layer_output = hidden_states + self.dropout(y)
         outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
@@ -388,11 +389,9 @@ class T5LayerCrossAttention(nn.Module):
 
     def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None):
         norm_x = self.layer_norm(hidden_states)
-        attention_output = self.EncDecAttention(norm_x,
-                                                mask=attention_mask,
-                                                kv=kv,
-                                                position_bias=position_bias,
-                                                head_mask=head_mask)
+        attention_output = self.EncDecAttention(
+            norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask
+        )
         y = attention_output[0]
         layer_output = hidden_states + self.dropout(y)
         outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
@@ -411,26 +410,36 @@ class T5Block(nn.Module):
         else:
             self.layer.append(T5LayerFF(config))
 
-    def forward(self, hidden_states, attention_mask=None, position_bias=None,
-                encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
-                head_mask=None):
-        self_attention_outputs = self.layer[0](hidden_states,
-                                                attention_mask=attention_mask,
-                                                position_bias=position_bias,
-                                                head_mask=head_mask)
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_bias=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        encoder_decoder_position_bias=None,
+        head_mask=None,
+    ):
+        self_attention_outputs = self.layer[0](
+            hidden_states, attention_mask=attention_mask, position_bias=position_bias, head_mask=head_mask
+        )
         hidden_states = self_attention_outputs[0]
         outputs = self_attention_outputs[1:]  # Keep self-attention outputs and relative position weights
 
         if not self.is_decoder:
             hidden_states = self.layer[1](hidden_states)
         else:
-            cross_attention_outputs = self.layer[1](hidden_states,
-                                                    kv=encoder_hidden_states,
-                                                    attention_mask=encoder_attention_mask,
-                                                    position_bias=encoder_decoder_position_bias,
-                                                    head_mask=head_mask)
+            cross_attention_outputs = self.layer[1](
+                hidden_states,
+                kv=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                position_bias=encoder_decoder_position_bias,
+                head_mask=head_mask,
+            )
             hidden_states = cross_attention_outputs[0]
-            outputs = outputs + cross_attention_outputs[1:]  # Keep cross-attention outputs and relative position weights
+            outputs = (
+                outputs + cross_attention_outputs[1:]
+            )  # Keep cross-attention outputs and relative position weights
             hidden_states = self.layer[2](hidden_states)
 
         outputs = (hidden_states,) + outputs  # add attentions if we output them
@@ -441,6 +450,7 @@ class T5PreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = T5Config
     pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_t5
@@ -450,29 +460,31 @@ class T5PreTrainedModel(PreTrainedModel):
     def dummy_inputs(self):
         input_ids = torch.tensor(DUMMY_INPUTS)
         input_mask = torch.tensor(DUMMY_MASK)
-        dummy_inputs = {'decoder_input_ids': input_ids,
-                        'encoder_input_ids': input_ids,
-                        'decoder_attention_mask': input_mask}
+        dummy_inputs = {
+            "decoder_input_ids": input_ids,
+            "encoder_input_ids": input_ids,
+            "decoder_attention_mask": input_mask,
+        }
         return dummy_inputs
 
     def _init_weights(self, module):
         """ Initialize the weights """
         factor = self.config.initializer_factor  # Used for testing weights initialization
         if isinstance(module, T5LayerNorm):
-            module.weight.data.fill_(factor*1.0)
+            module.weight.data.fill_(factor * 1.0)
         elif isinstance(module, (T5Model, T5WithLMHeadModel)):
             # Mesh TensorFlow embeddings initialization
             # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
-            module.shared.weight.data.normal_(mean=0.0, std=factor*1.0)
+            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
         elif isinstance(module, T5DenseReluDense):
             # Mesh TensorFlow FF initialization
             # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
             # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
-            module.wi.weight.data.normal_(mean=0.0, std=factor*((self.config.d_model) ** -0.5))
-            if hasattr(module.wi, 'bias') and module.wi.bias is not None:
+            module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
+            if hasattr(module.wi, "bias") and module.wi.bias is not None:
                 module.wi.bias.data.zero_()
-            module.wo.weight.data.normal_(mean=0.0, std=factor*((self.config.d_ff) ** -0.5))
-            if hasattr(module.wo, 'bias') and module.wo.bias is not None:
+            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
+            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                 module.wo.bias.data.zero_()
         elif isinstance(module, T5Attention):
             # Mesh TensorFlow attention initialization to avoid scaling before softmax
@@ -480,12 +492,12 @@ class T5PreTrainedModel(PreTrainedModel):
             d_model = self.config.d_model
             d_kv = self.config.d_kv
             n_heads = self.config.num_heads
-            module.q.weight.data.normal_(mean=0.0, std=factor*((d_model * d_kv) ** -0.5))
-            module.k.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5))
-            module.v.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5))
-            module.o.weight.data.normal_(mean=0.0, std=factor*((n_heads * d_kv) ** -0.5))
+            module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * d_kv) ** -0.5))
+            module.k.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5))
+            module.v.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5))
+            module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * d_kv) ** -0.5))
             if module.has_relative_attention_bias:
-                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor*((d_model) ** -0.5))
+                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
 
 
 class T5Stack(T5PreTrainedModel):
@@ -495,19 +507,22 @@ class T5Stack(T5PreTrainedModel):
         self.output_hidden_states = config.output_hidden_states
         self.is_decoder = config.is_decoder
 
-        self.block = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0))
-                                    for i in range(config.num_layers)])
+        self.block = nn.ModuleList(
+            [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
+        )
         self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)
 
         self.init_weights()
 
-    def forward(self,
-                hidden_states,
-                attention_mask=None,
-                encoder_hidden_states=None,
-                encoder_attention_mask=None,
-                head_mask=None):
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        head_mask=None,
+    ):
 
         batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1]
         if attention_mask is None:
@@ -521,9 +536,9 @@ class T5Stack(T5PreTrainedModel):
         if attention_mask.dim() == 3:
             extended_attention_mask = attention_mask[:, None, :, :]
         elif attention_mask.dim() == 2:
-        # Provided a padding mask of dimensions [batch_size, seq_length]
-        # - if the model is a decoder, apply a causal mask in addition to the padding mask
-        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            # Provided a padding mask of dimensions [batch_size, seq_length]
+            # - if the model is a decoder, apply a causal mask in addition to the padding mask
+            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
             if self.config.is_decoder:
                 seq_ids = torch.arange(seq_length, device=hidden_states.device)
                 causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
@@ -557,7 +572,9 @@ class T5Stack(T5PreTrainedModel):
             # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
             # encoder_extended_attention_mask = (encoder_extended_attention_mask == encoder_extended_attention_mask.transpose(-1, -2))
 
-            encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            encoder_extended_attention_mask = encoder_extended_attention_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # fp16 compatibility
             encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
         else:
             encoder_extended_attention_mask = None
@@ -572,8 +589,12 @@ class T5Stack(T5PreTrainedModel):
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                 head_mask = head_mask.expand(self.config.num_layers, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+                head_mask = (
+                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
+                )  # We can specify head_mask for each layer
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to fload if need + fp16 compatibility
         else:
             head_mask = [None] * self.config.num_layers
 
@@ -587,13 +608,15 @@ class T5Stack(T5PreTrainedModel):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
-            layer_outputs = layer_module(hidden_states,
-                                         attention_mask=extended_attention_mask,
-                                         position_bias=position_bias,
-                                         encoder_hidden_states=encoder_hidden_states,
-                                         encoder_attention_mask=encoder_extended_attention_mask,
-                                         encoder_decoder_position_bias=encoder_decoder_position_bias,
-                                         head_mask=head_mask[i])
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask=extended_attention_mask,
+                position_bias=position_bias,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_extended_attention_mask,
+                encoder_decoder_position_bias=encoder_decoder_position_bias,
+                head_mask=head_mask[i],
+            )
             # layer_outputs is a tuple with:
             # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
             hidden_states = layer_outputs[0]
@@ -672,9 +695,12 @@ T5_INPUTS_DOCSTRING = r"""
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
 
-@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states"
-                      "without any specific head on top.",
-                      T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.",
+    T5_START_DOCSTRING,
+    T5_INPUTS_DOCSTRING,
+)
 class T5Model(T5PreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -697,6 +723,7 @@ class T5Model(T5PreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config):
         super(T5Model, self).__init__(config)
         self.shared = nn.Embedding(config.vocab_size, config.d_model)
@@ -729,12 +756,13 @@ class T5Model(T5PreTrainedModel):
         # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
         # that apply to the model as whole.
         # We let the specific kwargs override the common ones in case of conflict.
-        kwargs_common = dict((k, v) for k, v in kwargs.items()
-                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_common = dict(
+            (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_")
+        )
         kwargs_encoder = kwargs_common.copy()
         kwargs_decoder = kwargs_common.copy()
-        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
-        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+        kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_")))
 
         # Encode if needed (training, first prediction pass)
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
@@ -770,8 +798,7 @@ class T5Model(T5PreTrainedModel):
         return decoder_outputs + encoder_outputs
 
 
-@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
-    T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
 class T5WithLMHeadModel(T5PreTrainedModel):
     r"""
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -802,6 +829,7 @@ class T5WithLMHeadModel(T5PreTrainedModel):
         loss, prediction_scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(T5WithLMHeadModel, self).__init__(config)
         self.model_dim = config.d_model
@@ -834,14 +862,15 @@ class T5WithLMHeadModel(T5PreTrainedModel):
         # that apply to the model as whole.
         # We let the specific kwargs override the common ones in case of conflict.
 
-        lm_labels = kwargs.pop('decoder_lm_labels', None)
+        lm_labels = kwargs.pop("decoder_lm_labels", None)
 
-        kwargs_common = dict((k, v) for k, v in kwargs.items()
-                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_common = dict(
+            (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_")
+        )
         kwargs_encoder = kwargs_common.copy()
         kwargs_decoder = kwargs_common.copy()
-        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
-        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+        kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_")))
 
         # Encode if needed (training, first prediction pass)
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
@@ -879,8 +908,9 @@ class T5WithLMHeadModel(T5PreTrainedModel):
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = lm_labels[..., 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-                            shift_labels.view(-1))
-            decoder_outputs = (loss,) + decoder_outputs  # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+            decoder_outputs = (
+                loss,
+            ) + decoder_outputs  # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
 
         return decoder_outputs + encoder_outputs
diff --git a/transformers/modeling_tf_albert.py b/transformers/modeling_tf_albert.py
index ac55a73fa3..25d0863987 100644
--- a/transformers/modeling_tf_albert.py
+++ b/transformers/modeling_tf_albert.py
@@ -31,14 +31,14 @@ import logging
 logger = logging.getLogger(__name__)
 
 TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tf_model.h5",
-    'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tf_model.h5",
-    'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tf_model.h5",
-    'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tf_model.h5",
-    'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5",
-    'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5",
-    'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5",
-    'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5",
+    "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tf_model.h5",
+    "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tf_model.h5",
+    "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tf_model.h5",
+    "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tf_model.h5",
+    "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5",
+    "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5",
+    "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5",
+    "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5",
 }
 
 
@@ -50,21 +50,22 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
         super(TFAlbertEmbeddings, self).__init__(**kwargs)
 
         self.config = config
-        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
-                                                             config.embedding_size,
-                                                             embeddings_initializer=get_initializer(
-                                                                 self.config.initializer_range),
-                                                             name='position_embeddings')
-        self.token_type_embeddings = tf.keras.layers.Embedding(config.type_vocab_size,
-                                                               config.embedding_size,
-                                                               embeddings_initializer=get_initializer(
-                                                                   self.config.initializer_range),
-                                                               name='token_type_embeddings')
+        self.position_embeddings = tf.keras.layers.Embedding(
+            config.max_position_embeddings,
+            config.embedding_size,
+            embeddings_initializer=get_initializer(self.config.initializer_range),
+            name="position_embeddings",
+        )
+        self.token_type_embeddings = tf.keras.layers.Embedding(
+            config.type_vocab_size,
+            config.embedding_size,
+            embeddings_initializer=get_initializer(self.config.initializer_range),
+            name="token_type_embeddings",
+        )
 
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
         # any TensorFlow checkpoint file
-        self.LayerNorm = tf.keras.layers.LayerNormalization(
-            epsilon=config.layer_norm_eps, name='LayerNorm')
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
 
     def build(self, input_shape):
@@ -75,7 +76,8 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
             self.word_embeddings = self.add_weight(
                 "weight",
                 shape=[self.config.vocab_size, self.config.embedding_size],
-                initializer=get_initializer(self.config.initializer_range))
+                initializer=get_initializer(self.config.initializer_range),
+            )
         super(TFAlbertEmbeddings, self).build(input_shape)
 
     def call(self, inputs, mode="embedding", training=False):
@@ -145,34 +147,29 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer):
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
+                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+            )
         self.output_attentions = config.output_attentions
 
         self.num_attention_heads = config.num_attention_heads
         assert config.hidden_size % config.num_attention_heads == 0
-        self.attention_head_size = int(
-            config.hidden_size / config.num_attention_heads)
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
         self.all_head_size = self.num_attention_heads * self.attention_head_size
 
-        self.query = tf.keras.layers.Dense(self.all_head_size,
-                                           kernel_initializer=get_initializer(
-                                               config.initializer_range),
-                                           name='query')
-        self.key = tf.keras.layers.Dense(self.all_head_size,
-                                         kernel_initializer=get_initializer(
-                                             config.initializer_range),
-                                         name='key')
-        self.value = tf.keras.layers.Dense(self.all_head_size,
-                                           kernel_initializer=get_initializer(
-                                               config.initializer_range),
-                                           name='value')
+        self.query = tf.keras.layers.Dense(
+            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+        )
+        self.key = tf.keras.layers.Dense(
+            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
+        )
+        self.value = tf.keras.layers.Dense(
+            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+        )
 
-        self.dropout = tf.keras.layers.Dropout(
-            config.attention_probs_dropout_prob)
+        self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
 
     def transpose_for_scores(self, x, batch_size):
-        x = tf.reshape(
-            x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
+        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
         return tf.transpose(x, perm=[0, 2, 1, 3])
 
     def call(self, inputs, training=False):
@@ -212,23 +209,21 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer):
         context_layer = tf.matmul(attention_probs, value_layer)
 
         context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
-        context_layer = tf.reshape(context_layer,
-                                   (batch_size, -1, self.all_head_size))  # (batch_size, seq_len_q, all_head_size)
+        context_layer = tf.reshape(
+            context_layer, (batch_size, -1, self.all_head_size)
+        )  # (batch_size, seq_len_q, all_head_size)
 
-        outputs = (context_layer, attention_probs) if self.output_attentions else (
-            context_layer,)
+        outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
         return outputs
 
 
 class TFAlbertSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFAlbertSelfOutput, self).__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(config.hidden_size,
-                                           kernel_initializer=get_initializer(
-                                               config.initializer_range),
-                                           name='dense')
-        self.LayerNorm = tf.keras.layers.LayerNormalization(
-            epsilon=config.layer_norm_eps, name='LayerNorm')
+        self.dense = tf.keras.layers.Dense(
+            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
 
     def call(self, inputs, training=False):
@@ -245,12 +240,10 @@ class TFAlbertAttention(TFBertSelfAttention):
         super(TFAlbertAttention, self).__init__(config, **kwargs)
 
         self.hidden_size = config.hidden_size
-        self.dense = tf.keras.layers.Dense(config.hidden_size,
-                                           kernel_initializer=get_initializer(
-                                               config.initializer_range),
-                                           name='dense')
-        self.LayerNorm = tf.keras.layers.LayerNormalization(
-            epsilon=config.layer_norm_eps, name='LayerNorm')
+        self.dense = tf.keras.layers.Dense(
+            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.pruned_heads = set()
 
     def prune_heads(self, heads):
@@ -293,11 +286,11 @@ class TFAlbertAttention(TFBertSelfAttention):
         context_layer = tf.matmul(attention_probs, value_layer)
 
         context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
-        context_layer = tf.reshape(context_layer,
-                                   (batch_size, -1, self.all_head_size))  # (batch_size, seq_len_q, all_head_size)
+        context_layer = tf.reshape(
+            context_layer, (batch_size, -1, self.all_head_size)
+        )  # (batch_size, seq_len_q, all_head_size)
 
-        self_outputs = (context_layer, attention_probs) if self.output_attentions else (
-            context_layer,)
+        self_outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
 
         hidden_states = self_outputs[0]
 
@@ -313,34 +306,35 @@ class TFAlbertAttention(TFBertSelfAttention):
 class TFAlbertLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFAlbertLayer, self).__init__(**kwargs)
-        self.attention = TFAlbertAttention(config, name='attention')
+        self.attention = TFAlbertAttention(config, name="attention")
 
-        self.ffn = tf.keras.layers.Dense(config.intermediate_size, kernel_initializer=get_initializer(
-            config.initializer_range), name='ffn')
+        self.ffn = tf.keras.layers.Dense(
+            config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn"
+        )
 
         if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
             self.activation = ACT2FN[config.hidden_act]
         else:
             self.activation = config.hidden_act
 
-        self.ffn_output = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
-            config.initializer_range), name='ffn_output')
+        self.ffn_output = tf.keras.layers.Dense(
+            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output"
+        )
         self.full_layer_layer_norm = tf.keras.layers.LayerNormalization(
-            epsilon=config.layer_norm_eps, name='full_layer_layer_norm')
+            epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
+        )
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
 
     def call(self, inputs, training=False):
         hidden_states, attention_mask, head_mask = inputs
 
-        attention_outputs = self.attention(
-            [hidden_states, attention_mask, head_mask], training=training)
+        attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training)
         ffn_output = self.ffn(attention_outputs[0])
         ffn_output = self.activation(ffn_output)
         ffn_output = self.ffn_output(ffn_output)
 
         hidden_states = self.dropout(hidden_states, training=training)
-        hidden_states = self.full_layer_layer_norm(
-            ffn_output + attention_outputs[0])
+        hidden_states = self.full_layer_layer_norm(ffn_output + attention_outputs[0])
 
         # add attentions if we output them
         outputs = (hidden_states,) + attention_outputs[1:]
@@ -353,8 +347,9 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
 
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
-        self.albert_layers = [TFAlbertLayer(config, name="albert_layers_._{}".format(
-            i)) for i in range(config.inner_group_num)]
+        self.albert_layers = [
+            TFAlbertLayer(config, name="albert_layers_._{}".format(i)) for i in range(config.inner_group_num)
+        ]
 
     def call(self, inputs, training=False):
         hidden_states, attention_mask, head_mask = inputs
@@ -363,8 +358,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
         layer_attentions = ()
 
         for layer_index, albert_layer in enumerate(self.albert_layers):
-            layer_output = albert_layer(
-                [hidden_states, attention_mask, head_mask[layer_index]], training=training)
+            layer_output = albert_layer([hidden_states, attention_mask, head_mask[layer_index]], training=training)
             hidden_states = layer_output[0]
 
             if self.output_attentions:
@@ -389,10 +383,15 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
         self.config = config
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
-        self.embedding_hidden_mapping_in = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
-            config.initializer_range), name='embedding_hidden_mapping_in')
-        self.albert_layer_groups = [TFAlbertLayerGroup(
-            config, name="albert_layer_groups_._{}".format(i)) for i in range(config.num_hidden_groups)]
+        self.embedding_hidden_mapping_in = tf.keras.layers.Dense(
+            config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="embedding_hidden_mapping_in",
+        )
+        self.albert_layer_groups = [
+            TFAlbertLayerGroup(config, name="albert_layer_groups_._{}".format(i))
+            for i in range(config.num_hidden_groups)
+        ]
 
     def call(self, inputs, training=False):
         hidden_states, attention_mask, head_mask = inputs
@@ -405,15 +404,19 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
 
         for i in range(self.config.num_hidden_layers):
             # Number of layers in a hidden group
-            layers_per_group = int(
-                self.config.num_hidden_layers / self.config.num_hidden_groups)
+            layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)
 
             # Index of the hidden group
-            group_idx = int(
-                i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
+            group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
 
             layer_group_output = self.albert_layer_groups[group_idx](
-                [hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group]], training=training)
+                [
+                    hidden_states,
+                    attention_mask,
+                    head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
+                ],
+                training=training,
+            )
             hidden_states = layer_group_output[0]
 
             if self.output_attentions:
@@ -436,6 +439,7 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = AlbertConfig
     pretrained_model_archive_map = TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "albert"
@@ -446,31 +450,25 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
         super(TFAlbertMLMHead, self).__init__(**kwargs)
         self.vocab_size = config.vocab_size
 
-        self.dense = tf.keras.layers.Dense(config.embedding_size,
-                                           kernel_initializer=get_initializer(
-                                               config.initializer_range),
-                                           name='dense')
+        self.dense = tf.keras.layers.Dense(
+            config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
         if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
             self.activation = ACT2FN[config.hidden_act]
         else:
             self.activation = config.hidden_act
 
-        self.LayerNorm = tf.keras.layers.LayerNormalization(
-            epsilon=config.layer_norm_eps, name='LayerNorm')
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
 
         # The output weights are the same as the input embeddings, but there is
         # an output-only bias for each token.
         self.decoder = input_embeddings
 
     def build(self, input_shape):
-        self.bias = self.add_weight(shape=(self.vocab_size,),
-                                    initializer='zeros',
-                                    trainable=True,
-                                    name='bias')
-        self.decoder_bias = self.add_weight(shape=(self.vocab_size,),
-                                    initializer='zeros',
-                                    trainable=True,
-                                    name='decoder/bias')
+        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
+        self.decoder_bias = self.add_weight(
+            shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
+        )
         super(TFAlbertMLMHead, self).build(input_shape)
 
     def call(self, hidden_states):
@@ -560,8 +558,12 @@ ALBERT_INPUTS_DOCSTRING = r"""
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
 
-@add_start_docstrings("The bare Albert Model transformer outputing raw hidden-states without any specific head on top.",
-                      ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare Albert Model transformer outputing raw hidden-states without any specific head on top.",
+    ALBERT_START_DOCSTRING,
+    ALBERT_INPUTS_DOCSTRING,
+)
 class TFAlbertModel(TFAlbertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -601,8 +603,12 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
 
         self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
         self.encoder = TFAlbertTransformer(config, name="encoder")
-        self.pooler = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
-            config.initializer_range), activation='tanh', name='pooler')
+        self.pooler = tf.keras.layers.Dense(
+            config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="tanh",
+            name="pooler",
+        )
 
     def get_input_embeddings(self):
         return self.embeddings
@@ -617,7 +623,16 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
         """
         raise NotImplementedError
 
-    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
+    def call(
+        self,
+        inputs,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        training=False,
+    ):
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
             attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
@@ -627,12 +642,12 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
             inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
             assert len(inputs) <= 6, "Too many inputs."
         elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            attention_mask = inputs.get('attention_mask', attention_mask)
-            token_type_ids = inputs.get('token_type_ids', token_type_ids)
-            position_ids = inputs.get('position_ids', position_ids)
-            head_mask = inputs.get('head_mask', head_mask)
-            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            input_ids = inputs.get("input_ids")
+            attention_mask = inputs.get("attention_mask", attention_mask)
+            token_type_ids = inputs.get("token_type_ids", token_type_ids)
+            position_ids = inputs.get("position_ids", position_ids)
+            head_mask = inputs.get("head_mask", head_mask)
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             assert len(inputs) <= 6, "Too many inputs."
         else:
             input_ids = inputs
@@ -678,10 +693,8 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
             head_mask = [None] * self.num_hidden_layers
             # head_mask = tf.constant([0] * self.num_hidden_layers)
 
-        embedding_output = self.embeddings(
-            [input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
-        encoder_outputs = self.encoder(
-            [embedding_output, extended_attention_mask, head_mask], training=training)
+        embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
+        encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
 
         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output[:, 0])
@@ -692,8 +705,9 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
         return outputs
 
 
-@add_start_docstrings("""Albert Model with a `language modeling` head on top. """,
-                      ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING
+)
 class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -723,9 +737,8 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
         super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)
 
-        self.albert = TFAlbertModel(config, name='albert')
-        self.predictions = TFAlbertMLMHead(
-            config, self.albert.embeddings, name='predictions')
+        self.albert = TFAlbertModel(config, name="albert")
+        self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
 
     def get_output_embeddings(self):
         return self.albert.embeddings
@@ -734,8 +747,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
         outputs = self.albert(inputs, **kwargs)
 
         sequence_output = outputs[0]
-        prediction_scores = self.predictions(
-            sequence_output, training=kwargs.get('training', False))
+        prediction_scores = self.predictions(sequence_output, training=kwargs.get("training", False))
 
         # Add hidden states and attention if they are here
         outputs = (prediction_scores,) + outputs[2:]
@@ -743,9 +755,12 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
         return outputs  # prediction_scores, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings(
+    """Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
     the pooled output) e.g. for GLUE tasks. """,
-    ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
+    ALBERT_START_DOCSTRING,
+    ALBERT_INPUTS_DOCSTRING,
+)
 class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -771,24 +786,25 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
         logits = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
 
-        self.albert = TFAlbertModel(config, name='albert')
+        self.albert = TFAlbertModel(config, name="albert")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='classifier')
+        self.classifier = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
 
     def call(self, inputs, **kwargs):
         outputs = self.albert(inputs, **kwargs)
 
         pooled_output = outputs[1]
 
-        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
+        pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
         logits = self.classifier(pooled_output)
 
         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
 
-        return outputs  # logits, (hidden_states), (attentions)
\ No newline at end of file
+        return outputs  # logits, (hidden_states), (attentions)
diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py
index 031ffea17e..24a7338d41 100644
--- a/transformers/modeling_tf_auto.py
+++ b/transformers/modeling_tf_auto.py
@@ -18,24 +18,70 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import logging
 
-from .configuration_auto import (BertConfig, CTRLConfig, DistilBertConfig,
-                                 GPT2Config, OpenAIGPTConfig, RobertaConfig,
-                                 TransfoXLConfig, XLMConfig, XLNetConfig)
+from .configuration_auto import (
+    BertConfig,
+    CTRLConfig,
+    DistilBertConfig,
+    GPT2Config,
+    OpenAIGPTConfig,
+    RobertaConfig,
+    TransfoXLConfig,
+    XLMConfig,
+    XLNetConfig,
+)
 
-from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, \
-    TFBertForQuestionAnswering, TFBertForTokenClassification, TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_bert import (
+    TFBertModel,
+    TFBertForMaskedLM,
+    TFBertForSequenceClassification,
+    TFBertForQuestionAnswering,
+    TFBertForTokenClassification,
+    TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
 from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel, TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, \
-    TFXLNetForQuestionAnsweringSimple, TFXLNetForTokenClassification, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, \
-    TFXLMForQuestionAnsweringSimple, TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, \
-    TFRobertaForTokenClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_transfo_xl import (
+    TFTransfoXLModel,
+    TFTransfoXLLMHeadModel,
+    TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
+from .modeling_tf_xlnet import (
+    TFXLNetModel,
+    TFXLNetLMHeadModel,
+    TFXLNetForSequenceClassification,
+    TFXLNetForQuestionAnsweringSimple,
+    TFXLNetForTokenClassification,
+    TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
+from .modeling_tf_xlm import (
+    TFXLMModel,
+    TFXLMWithLMHeadModel,
+    TFXLMForSequenceClassification,
+    TFXLMForQuestionAnsweringSimple,
+    TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
+from .modeling_tf_roberta import (
+    TFRobertaModel,
+    TFRobertaForMaskedLM,
+    TFRobertaForSequenceClassification,
+    TFRobertaForTokenClassification,
+    TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
+from .modeling_tf_distilbert import (
+    TFDistilBertModel,
+    TFDistilBertForQuestionAnswering,
+    TFDistilBertForMaskedLM,
+    TFDistilBertForSequenceClassification,
+    TFDistilBertForTokenClassification,
+    TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
 from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_albert import TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_albert import (
+    TFAlbertModel,
+    TFAlbertForMaskedLM,
+    TFAlbertForSequenceClassification,
+    TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+)
 from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .file_utils import add_start_docstrings
@@ -43,7 +89,8 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 
 
-TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
+TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict(
+    (key, value)
     for pretrained_map in [
         TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
@@ -56,8 +103,9 @@ TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
         TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP,
-        ]
-    for key, value, in pretrained_map.items())
+    ]
+    for key, value, in pretrained_map.items()
+)
 
 
 class TFAutoModel(object):
@@ -85,10 +133,13 @@ class TFAutoModel(object):
 
         This class cannot be instantiated using `__init__()` (throws an error).
     """
+
     def __init__(self):
-        raise EnvironmentError("TFAutoModel is designed to be instantiated "
+        raise EnvironmentError(
+            "TFAutoModel is designed to be instantiated "
             "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` or "
-            "`TFAutoModel.from_config(config)` methods.")
+            "`TFAutoModel.from_config(config)` methods."
+        )
 
     @classmethod
     def from_config(cls, config):
@@ -209,32 +260,34 @@ class TFAutoModel(object):
             model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
 
         """
-        if 't5' in pretrained_model_name_or_path:
+        if "t5" in pretrained_model_name_or_path:
             return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'distilbert' in pretrained_model_name_or_path:
+        elif "distilbert" in pretrained_model_name_or_path:
             return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'albert' in pretrained_model_name_or_path:
+        elif "albert" in pretrained_model_name_or_path:
             return TFAlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
+        elif "roberta" in pretrained_model_name_or_path:
             return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'bert' in pretrained_model_name_or_path:
+        elif "bert" in pretrained_model_name_or_path:
             return TFBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'openai-gpt' in pretrained_model_name_or_path:
+        elif "openai-gpt" in pretrained_model_name_or_path:
             return TFOpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'gpt2' in pretrained_model_name_or_path:
+        elif "gpt2" in pretrained_model_name_or_path:
             return TFGPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'transfo-xl' in pretrained_model_name_or_path:
+        elif "transfo-xl" in pretrained_model_name_or_path:
             return TFTransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlnet' in pretrained_model_name_or_path:
+        elif "xlnet" in pretrained_model_name_or_path:
             return TFXLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlm' in pretrained_model_name_or_path:
+        elif "xlm" in pretrained_model_name_or_path:
             return TFXLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'ctrl' in pretrained_model_name_or_path:
+        elif "ctrl" in pretrained_model_name_or_path:
             return TFCTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
+        raise ValueError(
+            "Unrecognized model identifier in {}. Should contains one of "
+            "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+            "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path)
+        )
 
 
 class TFAutoModelWithLMHead(object):
@@ -262,10 +315,13 @@ class TFAutoModelWithLMHead(object):
 
         This class cannot be instantiated using `__init__()` (throws an error).
     """
+
     def __init__(self):
-        raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated "
+        raise EnvironmentError(
+            "TFAutoModelWithLMHead is designed to be instantiated "
             "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or "
-            "`TFAutoModelWithLMHead.from_config(config)` methods.")
+            "`TFAutoModelWithLMHead.from_config(config)` methods."
+        )
 
     @classmethod
     def from_config(cls, config):
@@ -390,32 +446,34 @@ class TFAutoModelWithLMHead(object):
             model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
 
         """
-        if 't5' in pretrained_model_name_or_path:
+        if "t5" in pretrained_model_name_or_path:
             return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'distilbert' in pretrained_model_name_or_path:
+        elif "distilbert" in pretrained_model_name_or_path:
             return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'albert' in pretrained_model_name_or_path:
+        elif "albert" in pretrained_model_name_or_path:
             return TFAlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
+        elif "roberta" in pretrained_model_name_or_path:
             return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'bert' in pretrained_model_name_or_path:
+        elif "bert" in pretrained_model_name_or_path:
             return TFBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'openai-gpt' in pretrained_model_name_or_path:
+        elif "openai-gpt" in pretrained_model_name_or_path:
             return TFOpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'gpt2' in pretrained_model_name_or_path:
+        elif "gpt2" in pretrained_model_name_or_path:
             return TFGPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'transfo-xl' in pretrained_model_name_or_path:
+        elif "transfo-xl" in pretrained_model_name_or_path:
             return TFTransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlnet' in pretrained_model_name_or_path:
+        elif "xlnet" in pretrained_model_name_or_path:
             return TFXLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlm' in pretrained_model_name_or_path:
+        elif "xlm" in pretrained_model_name_or_path:
             return TFXLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'ctrl' in pretrained_model_name_or_path:
+        elif "ctrl" in pretrained_model_name_or_path:
             return TFCTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
+        raise ValueError(
+            "Unrecognized model identifier in {}. Should contains one of "
+            "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+            "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path)
+        )
 
 
 class TFAutoModelForSequenceClassification(object):
@@ -438,10 +496,13 @@ class TFAutoModelForSequenceClassification(object):
 
         This class cannot be instantiated using `__init__()` (throws an error).
     """
+
     def __init__(self):
-        raise EnvironmentError("TFAutoModelForSequenceClassification is designed to be instantiated "
+        raise EnvironmentError(
+            "TFAutoModelForSequenceClassification is designed to be instantiated "
             "using the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or "
-            "`TFAutoModelForSequenceClassification.from_config(config)` methods.")
+            "`TFAutoModelForSequenceClassification.from_config(config)` methods."
+        )
 
     @classmethod
     def from_config(cls, config):
@@ -552,21 +613,33 @@ class TFAutoModelForSequenceClassification(object):
             model = TFAutoModelForSequenceClassification.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
-            return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'albert' in pretrained_model_name_or_path:
-            return TFAlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
-            return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'bert' in pretrained_model_name_or_path:
-            return TFBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlnet' in pretrained_model_name_or_path:
-            return TFXLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlm' in pretrained_model_name_or_path:
+        if "distilbert" in pretrained_model_name_or_path:
+            return TFDistilBertForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "albert" in pretrained_model_name_or_path:
+            return TFAlbertForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "roberta" in pretrained_model_name_or_path:
+            return TFRobertaForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "bert" in pretrained_model_name_or_path:
+            return TFBertForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "xlnet" in pretrained_model_name_or_path:
+            return TFXLNetForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "xlm" in pretrained_model_name_or_path:
             return TFXLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'distilbert', 'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
+        raise ValueError(
+            "Unrecognized model identifier in {}. Should contains one of "
+            "'distilbert', 'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path)
+        )
 
 
 class TFAutoModelForQuestionAnswering(object):
@@ -588,10 +661,13 @@ class TFAutoModelForQuestionAnswering(object):
 
         This class cannot be instantiated using `__init__()` (throws an error).
     """
+
     def __init__(self):
-        raise EnvironmentError("TFAutoModelForQuestionAnswering is designed to be instantiated "
+        raise EnvironmentError(
+            "TFAutoModelForQuestionAnswering is designed to be instantiated "
             "using the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or "
-            "`TFAutoModelForQuestionAnswering.from_config(config)` methods.")
+            "`TFAutoModelForQuestionAnswering.from_config(config)` methods."
+        )
 
     @classmethod
     def from_config(cls, config):
@@ -698,24 +774,34 @@ class TFAutoModelForQuestionAnswering(object):
             model = TFAutoModelForQuestionAnswering.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
-            return TFDistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'bert' in pretrained_model_name_or_path:
+        if "distilbert" in pretrained_model_name_or_path:
+            return TFDistilBertForQuestionAnswering.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "bert" in pretrained_model_name_or_path:
             return TFBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlnet' in pretrained_model_name_or_path:
-            return TFXLNetForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlm' in pretrained_model_name_or_path:
-            return TFXLMForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif "xlnet" in pretrained_model_name_or_path:
+            return TFXLNetForQuestionAnsweringSimple.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "xlm" in pretrained_model_name_or_path:
+            return TFXLMForQuestionAnsweringSimple.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
 
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'distilbert', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
+        raise ValueError(
+            "Unrecognized model identifier in {}. Should contains one of "
+            "'distilbert', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path)
+        )
 
 
 class TFAutoModelForTokenClassification:
     def __init__(self):
-        raise EnvironmentError("TFAutoModelForTokenClassification is designed to be instantiated "
-                               "using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or "
-                               "`AutoModelForTokenClassification.from_config(config)` methods.")
+        raise EnvironmentError(
+            "TFAutoModelForTokenClassification is designed to be instantiated "
+            "using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or "
+            "`AutoModelForTokenClassification.from_config(config)` methods."
+        )
 
     @classmethod
     def from_config(cls, config):
@@ -815,14 +901,20 @@ class TFAutoModelForTokenClassification:
             model = TFAutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if 'bert' in pretrained_model_name_or_path:
+        if "bert" in pretrained_model_name_or_path:
             return TFBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlnet' in pretrained_model_name_or_path:
+        elif "xlnet" in pretrained_model_name_or_path:
             return TFXLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'distilbert' in pretrained_model_name_or_path:
-            return TFDistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
-            return TFRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif "distilbert" in pretrained_model_name_or_path:
+            return TFDistilBertForTokenClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        elif "roberta" in pretrained_model_name_or_path:
+            return TFRobertaForTokenClassification.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
 
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'xlnet', 'distilbert', 'roberta'".format(pretrained_model_name_or_path))
+        raise ValueError(
+            "Unrecognized model identifier in {}. Should contains one of "
+            "'bert', 'xlnet', 'distilbert', 'roberta'".format(pretrained_model_name_or_path)
+        )
diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py
index 9caad53a5f..bcb83d5df7 100644
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
@@ -35,25 +35,25 @@ logger = logging.getLogger(__name__)
 
 
 TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5",
-    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5",
-    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5",
-    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5",
-    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5",
-    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5",
-    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5",
-    'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5",
-    'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5",
-    'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5",
-    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
-    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
-    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
-    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
-    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
-    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
-    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5",
-    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5",
-    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5",
+    "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5",
+    "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5",
+    "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5",
+    "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5",
+    "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5",
+    "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5",
+    "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5",
+    "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5",
+    "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5",
+    "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5",
+    "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
+    "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
+    "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
+    "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
+    "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
+    "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
+    "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5",
+    "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5",
+    "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5",
 }
 
 
@@ -67,6 +67,7 @@ def gelu(x):
     cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
     return x * cdf
 
+
 def gelu_new(x):
     """Gaussian Error Linear Unit.
     This is a smoother version of the RELU.
@@ -76,41 +77,48 @@ def gelu_new(x):
     Returns:
         `x` with the GELU activation applied.
     """
-    cdf = 0.5 * (1.0 + tf.tanh(
-        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
+    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
     return x * cdf
 
+
 def swish(x):
     return x * tf.sigmoid(x)
 
 
-ACT2FN = {"gelu": tf.keras.layers.Activation(gelu),
-          "relu": tf.keras.activations.relu,
-          "swish": tf.keras.layers.Activation(swish),
-          "gelu_new": tf.keras.layers.Activation(gelu_new)}
+ACT2FN = {
+    "gelu": tf.keras.layers.Activation(gelu),
+    "relu": tf.keras.activations.relu,
+    "swish": tf.keras.layers.Activation(swish),
+    "gelu_new": tf.keras.layers.Activation(gelu_new),
+}
 
 
 class TFBertEmbeddings(tf.keras.layers.Layer):
     """Construct the embeddings from word, position and token_type embeddings.
     """
+
     def __init__(self, config, **kwargs):
         super(TFBertEmbeddings, self).__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.hidden_size = config.hidden_size
         self.initializer_range = config.initializer_range
 
-        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
-                                                             config.hidden_size,
-                                                             embeddings_initializer=get_initializer(self.initializer_range),
-                                                             name='position_embeddings')
-        self.token_type_embeddings = tf.keras.layers.Embedding(config.type_vocab_size,
-                                                               config.hidden_size,
-                                                               embeddings_initializer=get_initializer(self.initializer_range),
-                                                               name='token_type_embeddings')
+        self.position_embeddings = tf.keras.layers.Embedding(
+            config.max_position_embeddings,
+            config.hidden_size,
+            embeddings_initializer=get_initializer(self.initializer_range),
+            name="position_embeddings",
+        )
+        self.token_type_embeddings = tf.keras.layers.Embedding(
+            config.type_vocab_size,
+            config.hidden_size,
+            embeddings_initializer=get_initializer(self.initializer_range),
+            name="token_type_embeddings",
+        )
 
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
         # any TensorFlow checkpoint file
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
 
     def build(self, input_shape):
@@ -121,7 +129,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
             self.word_embeddings = self.add_weight(
                 "weight",
                 shape=[self.vocab_size, self.hidden_size],
-                initializer=get_initializer(self.initializer_range))
+                initializer=get_initializer(self.initializer_range),
+            )
         super(TFBertEmbeddings, self).build(input_shape)
 
     def call(self, inputs, mode="embedding", training=False):
@@ -193,7 +202,8 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
+                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+            )
         self.output_attentions = config.output_attentions
 
         self.num_attention_heads = config.num_attention_heads
@@ -201,15 +211,15 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
         self.all_head_size = self.num_attention_heads * self.attention_head_size
 
-        self.query = tf.keras.layers.Dense(self.all_head_size,
-                                           kernel_initializer=get_initializer(config.initializer_range),
-                                           name='query')
-        self.key = tf.keras.layers.Dense(self.all_head_size,
-                                         kernel_initializer=get_initializer(config.initializer_range),
-                                         name='key')
-        self.value = tf.keras.layers.Dense(self.all_head_size,
-                                           kernel_initializer=get_initializer(config.initializer_range),
-                                           name='value')
+        self.query = tf.keras.layers.Dense(
+            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+        )
+        self.key = tf.keras.layers.Dense(
+            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
+        )
+        self.value = tf.keras.layers.Dense(
+            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+        )
 
         self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
 
@@ -230,8 +240,10 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
         value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
 
         # Take the dot product between "query" and "key" to get the raw attention scores.
-        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)  # (batch size, num_heads, seq_len_q, seq_len_k)
-        dk = tf.cast(shape_list(key_layer)[-1], tf.float32) # scale attention_scores
+        attention_scores = tf.matmul(
+            query_layer, key_layer, transpose_b=True
+        )  # (batch size, num_heads, seq_len_q, seq_len_k)
+        dk = tf.cast(shape_list(key_layer)[-1], tf.float32)  # scale attention_scores
         attention_scores = attention_scores / tf.math.sqrt(dk)
 
         if attention_mask is not None:
@@ -252,8 +264,9 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
         context_layer = tf.matmul(attention_probs, value_layer)
 
         context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
-        context_layer = tf.reshape(context_layer,
-                                  (batch_size, -1, self.all_head_size))  # (batch_size, seq_len_q, all_head_size)
+        context_layer = tf.reshape(
+            context_layer, (batch_size, -1, self.all_head_size)
+        )  # (batch_size, seq_len_q, all_head_size)
 
         outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
         return outputs
@@ -262,10 +275,10 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
 class TFBertSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFBertSelfOutput, self).__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(config.hidden_size,
-                                           kernel_initializer=get_initializer(config.initializer_range),
-                                           name='dense')
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
+        self.dense = tf.keras.layers.Dense(
+            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
 
     def call(self, inputs, training=False):
@@ -280,8 +293,8 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
 class TFBertAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFBertAttention, self).__init__(**kwargs)
-        self.self_attention = TFBertSelfAttention(config, name='self')
-        self.dense_output = TFBertSelfOutput(config, name='output')
+        self.self_attention = TFBertSelfAttention(config, name="self")
+        self.dense_output = TFBertSelfOutput(config, name="output")
 
     def prune_heads(self, heads):
         raise NotImplementedError
@@ -298,9 +311,9 @@ class TFBertAttention(tf.keras.layers.Layer):
 class TFBertIntermediate(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFBertIntermediate, self).__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(config.intermediate_size,
-                                           kernel_initializer=get_initializer(config.initializer_range),
-                                           name='dense')
+        self.dense = tf.keras.layers.Dense(
+            config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
         if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
             self.intermediate_act_fn = ACT2FN[config.hidden_act]
         else:
@@ -315,10 +328,10 @@ class TFBertIntermediate(tf.keras.layers.Layer):
 class TFBertOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFBertOutput, self).__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(config.hidden_size,
-                                           kernel_initializer=get_initializer(config.initializer_range),
-                                           name='dense')
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
+        self.dense = tf.keras.layers.Dense(
+            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
 
     def call(self, inputs, training=False):
@@ -333,9 +346,9 @@ class TFBertOutput(tf.keras.layers.Layer):
 class TFBertLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFBertLayer, self).__init__(**kwargs)
-        self.attention = TFBertAttention(config, name='attention')
-        self.intermediate = TFBertIntermediate(config, name='intermediate')
-        self.bert_output = TFBertOutput(config, name='output')
+        self.attention = TFBertAttention(config, name="attention")
+        self.intermediate = TFBertIntermediate(config, name="intermediate")
+        self.bert_output = TFBertOutput(config, name="output")
 
     def call(self, inputs, training=False):
         hidden_states, attention_mask, head_mask = inputs
@@ -353,7 +366,7 @@ class TFBertEncoder(tf.keras.layers.Layer):
         super(TFBertEncoder, self).__init__(**kwargs)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
-        self.layer = [TFBertLayer(config, name='layer_._{}'.format(i)) for i in range(config.num_hidden_layers)]
+        self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
 
     def call(self, inputs, training=False):
         hidden_states, attention_mask, head_mask = inputs
@@ -385,10 +398,12 @@ class TFBertEncoder(tf.keras.layers.Layer):
 class TFBertPooler(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFBertPooler, self).__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(config.hidden_size,
-                                           kernel_initializer=get_initializer(config.initializer_range),
-                                           activation='tanh',
-                                           name='dense')
+        self.dense = tf.keras.layers.Dense(
+            config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="tanh",
+            name="dense",
+        )
 
     def call(self, hidden_states):
         # We "pool" the model by simply taking the hidden state corresponding
@@ -401,14 +416,14 @@ class TFBertPooler(tf.keras.layers.Layer):
 class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFBertPredictionHeadTransform, self).__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(config.hidden_size,
-                                           kernel_initializer=get_initializer(config.initializer_range),
-                                           name='dense')
+        self.dense = tf.keras.layers.Dense(
+            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
         if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
             self.transform_act_fn = ACT2FN[config.hidden_act]
         else:
             self.transform_act_fn = config.hidden_act
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
 
     def call(self, hidden_states):
         hidden_states = self.dense(hidden_states)
@@ -421,17 +436,14 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
         super(TFBertLMPredictionHead, self).__init__(**kwargs)
         self.vocab_size = config.vocab_size
-        self.transform = TFBertPredictionHeadTransform(config, name='transform')
+        self.transform = TFBertPredictionHeadTransform(config, name="transform")
 
         # The output weights are the same as the input embeddings, but there is
         # an output-only bias for each token.
         self.input_embeddings = input_embeddings
 
     def build(self, input_shape):
-        self.bias = self.add_weight(shape=(self.vocab_size,),
-                                    initializer='zeros',
-                                    trainable=True,
-                                    name='bias')
+        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
         super(TFBertLMPredictionHead, self).build(input_shape)
 
     def call(self, hidden_states):
@@ -444,7 +456,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
 class TFBertMLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
         super(TFBertMLMHead, self).__init__(**kwargs)
-        self.predictions = TFBertLMPredictionHead(config, input_embeddings, name='predictions')
+        self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions")
 
     def call(self, sequence_output):
         prediction_scores = self.predictions(sequence_output)
@@ -454,9 +466,9 @@ class TFBertMLMHead(tf.keras.layers.Layer):
 class TFBertNSPHead(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFBertNSPHead, self).__init__(**kwargs)
-        self.seq_relationship = tf.keras.layers.Dense(2,
-                                                      kernel_initializer=get_initializer(config.initializer_range),
-                                                      name='seq_relationship')
+        self.seq_relationship = tf.keras.layers.Dense(
+            2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship"
+        )
 
     def call(self, pooled_output):
         seq_relationship_score = self.seq_relationship(pooled_output)
@@ -468,9 +480,9 @@ class TFBertMainLayer(tf.keras.layers.Layer):
         super(TFBertMainLayer, self).__init__(**kwargs)
         self.num_hidden_layers = config.num_hidden_layers
 
-        self.embeddings = TFBertEmbeddings(config, name='embeddings')
-        self.encoder = TFBertEncoder(config, name='encoder')
-        self.pooler = TFBertPooler(config, name='pooler')
+        self.embeddings = TFBertEmbeddings(config, name="embeddings")
+        self.encoder = TFBertEncoder(config, name="encoder")
+        self.pooler = TFBertPooler(config, name="pooler")
 
     def get_input_embeddings(self):
         return self.embeddings
@@ -485,7 +497,16 @@ class TFBertMainLayer(tf.keras.layers.Layer):
         """
         raise NotImplementedError
 
-    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
+    def call(
+        self,
+        inputs,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        training=False,
+    ):
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
             attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
@@ -495,12 +516,12 @@ class TFBertMainLayer(tf.keras.layers.Layer):
             inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
             assert len(inputs) <= 6, "Too many inputs."
         elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            attention_mask = inputs.get('attention_mask', attention_mask)
-            token_type_ids = inputs.get('token_type_ids', token_type_ids)
-            position_ids = inputs.get('position_ids', position_ids)
-            head_mask = inputs.get('head_mask', head_mask)
-            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            input_ids = inputs.get("input_ids")
+            attention_mask = inputs.get("attention_mask", attention_mask)
+            token_type_ids = inputs.get("token_type_ids", token_type_ids)
+            position_ids = inputs.get("position_ids", position_ids)
+            head_mask = inputs.get("head_mask", head_mask)
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             assert len(inputs) <= 6, "Too many inputs."
         else:
             input_ids = inputs
@@ -552,7 +573,9 @@ class TFBertMainLayer(tf.keras.layers.Layer):
         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output)
 
-        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+        outputs = (sequence_output, pooled_output,) + encoder_outputs[
+            1:
+        ]  # add hidden_states and attentions if they are here
         return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
 
 
@@ -560,6 +583,7 @@ class TFBertPreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = BertConfig
     pretrained_model_archive_map = TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "bert"
@@ -648,8 +672,12 @@ BERT_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
-                      BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
+    BERT_START_DOCSTRING,
+    BERT_INPUTS_DOCSTRING,
+)
 class TFBertModel(TFBertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -682,18 +710,22 @@ class TFBertModel(TFBertPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFBertModel, self).__init__(config, *inputs, **kwargs)
-        self.bert = TFBertMainLayer(config, name='bert')
+        self.bert = TFBertMainLayer(config, name="bert")
 
     def call(self, inputs, **kwargs):
         outputs = self.bert(inputs, **kwargs)
         return outputs
 
 
-@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training:
+@add_start_docstrings(
+    """Bert Model with two heads on top as done during the pre-training:
     a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+    BERT_START_DOCSTRING,
+    BERT_INPUTS_DOCSTRING,
+)
 class TFBertForPreTraining(TFBertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -721,12 +753,13 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
         prediction_scores, seq_relationship_scores = outputs[:2]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFBertForPreTraining, self).__init__(config, *inputs, **kwargs)
 
-        self.bert = TFBertMainLayer(config, name='bert')
-        self.nsp = TFBertNSPHead(config, name='nsp___cls')
-        self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls')
+        self.bert = TFBertMainLayer(config, name="bert")
+        self.nsp = TFBertNSPHead(config, name="nsp___cls")
+        self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
 
     def get_output_embeddings(self):
         return self.bert.embeddings
@@ -735,16 +768,19 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
         outputs = self.bert(inputs, **kwargs)
 
         sequence_output, pooled_output = outputs[:2]
-        prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False))
+        prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
         seq_relationship_score = self.nsp(pooled_output)
 
-        outputs = (prediction_scores, seq_relationship_score,) + outputs[2:]  # add hidden states and attention if they are here
+        outputs = (prediction_scores, seq_relationship_score,) + outputs[
+            2:
+        ]  # add hidden states and attention if they are here
 
         return outputs  # prediction_scores, seq_relationship_score, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model with a `language modeling` head on top. """,
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING
+)
 class TFBertForMaskedLM(TFBertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -770,11 +806,12 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
         prediction_scores = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
 
-        self.bert = TFBertMainLayer(config, name='bert')
-        self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls')
+        self.bert = TFBertMainLayer(config, name="bert")
+        self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
 
     def get_output_embeddings(self):
         return self.bert.embeddings
@@ -783,15 +820,18 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
         outputs = self.bert(inputs, **kwargs)
 
         sequence_output = outputs[0]
-        prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False))
+        prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
 
         outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
 
         return outputs  # prediction_scores, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """Bert Model with a `next sentence prediction (classification)` head on top. """,
+    BERT_START_DOCSTRING,
+    BERT_INPUTS_DOCSTRING,
+)
 class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -817,11 +857,12 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
         seq_relationship_scores = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFBertForNextSentencePrediction, self).__init__(config, *inputs, **kwargs)
 
-        self.bert = TFBertMainLayer(config, name='bert')
-        self.nsp = TFBertNSPHead(config, name='nsp___cls')
+        self.bert = TFBertMainLayer(config, name="bert")
+        self.nsp = TFBertNSPHead(config, name="nsp___cls")
 
     def call(self, inputs, **kwargs):
         outputs = self.bert(inputs, **kwargs)
@@ -834,9 +875,12 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
         return outputs  # seq_relationship_score, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings(
+    """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
     the pooled output) e.g. for GLUE tasks. """,
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+    BERT_START_DOCSTRING,
+    BERT_INPUTS_DOCSTRING,
+)
 class TFBertForSequenceClassification(TFBertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -862,22 +906,23 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
         logits = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
 
-        self.bert = TFBertMainLayer(config, name='bert')
+        self.bert = TFBertMainLayer(config, name="bert")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='classifier')
+        self.classifier = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
 
     def call(self, inputs, **kwargs):
         outputs = self.bert(inputs, **kwargs)
 
         pooled_output = outputs[1]
 
-        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
+        pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
         logits = self.classifier(pooled_output)
 
         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
@@ -885,9 +930,12 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
         return outputs  # logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """Bert Model with a multiple choice classification head on top (a linear layer on top of
     the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+    BERT_START_DOCSTRING,
+    BERT_INPUTS_DOCSTRING,
+)
 class TFBertForMultipleChoice(TFBertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -915,16 +963,26 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
         classification_scores = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFBertForMultipleChoice, self).__init__(config, *inputs, **kwargs)
 
-        self.bert = TFBertMainLayer(config, name='bert')
+        self.bert = TFBertMainLayer(config, name="bert")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(1,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='classifier')
+        self.classifier = tf.keras.layers.Dense(
+            1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
 
-    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
+    def call(
+        self,
+        inputs,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        training=False,
+    ):
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
             attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
@@ -934,12 +992,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
             inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
             assert len(inputs) <= 6, "Too many inputs."
         elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            attention_mask = inputs.get('attention_mask', attention_mask)
-            token_type_ids = inputs.get('token_type_ids', token_type_ids)
-            position_ids = inputs.get('position_ids', position_ids)
-            head_mask = inputs.get('head_mask', head_mask)
-            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            input_ids = inputs.get("input_ids")
+            attention_mask = inputs.get("attention_mask", attention_mask)
+            token_type_ids = inputs.get("token_type_ids", token_type_ids)
+            position_ids = inputs.get("position_ids", position_ids)
+            head_mask = inputs.get("head_mask", head_mask)
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             assert len(inputs) <= 6, "Too many inputs."
         else:
             input_ids = inputs
@@ -956,7 +1014,14 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
         flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
         flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
 
-        flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds]
+        flat_inputs = [
+            flat_input_ids,
+            flat_attention_mask,
+            flat_token_type_ids,
+            flat_position_ids,
+            head_mask,
+            inputs_embeds,
+        ]
 
         outputs = self.bert(flat_inputs, training=training)
 
@@ -971,9 +1036,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
         return outputs  # reshaped_logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """Bert Model with a token classification head on top (a linear layer on top of
     the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+    BERT_START_DOCSTRING,
+    BERT_INPUTS_DOCSTRING,
+)
 class TFBertForTokenClassification(TFBertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -999,22 +1067,23 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
         scores = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
 
-        self.bert = TFBertMainLayer(config, name='bert')
+        self.bert = TFBertMainLayer(config, name="bert")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='classifier')
+        self.classifier = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
 
     def call(self, inputs, **kwargs):
         outputs = self.bert(inputs, **kwargs)
 
         sequence_output = outputs[0]
 
-        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
+        sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
         logits = self.classifier(sequence_output)
 
         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
@@ -1022,9 +1091,12 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
         return outputs  # scores, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
     the hidden-states output to compute `span start logits` and `span end logits`). """,
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+    BERT_START_DOCSTRING,
+    BERT_INPUTS_DOCSTRING,
+)
 class TFBertForQuestionAnswering(TFBertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -1052,14 +1124,15 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
         start_scores, end_scores = outputs[:2]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
 
-        self.bert = TFBertMainLayer(config, name='bert')
-        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='qa_outputs')
+        self.bert = TFBertMainLayer(config, name="bert")
+        self.qa_outputs = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
+        )
 
     def call(self, inputs, **kwargs):
         outputs = self.bert(inputs, **kwargs)
diff --git a/transformers/modeling_tf_ctrl.py b/transformers/modeling_tf_ctrl.py
index 0f9b34924f..3aba94a50d 100644
--- a/transformers/modeling_tf_ctrl.py
+++ b/transformers/modeling_tf_ctrl.py
@@ -32,15 +32,15 @@ logger = logging.getLogger(__name__)
 
 TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.huggingface.co/bert/ctrl-tf_model.h5"}
 
+
 def angle_defn(pos, i, d_model_size):
-    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model_size))
+    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model_size))
     return pos * angle_rates
 
+
 def positional_encoding(position, d_model_size):
     # create the sinusoidal pattern for the positional encoding
-    angle_rads = angle_defn(np.arange(position)[:, np.newaxis],
-                            np.arange(d_model_size)[np.newaxis, :],
-                            d_model_size)
+    angle_rads = angle_defn(np.arange(position)[:, np.newaxis], np.arange(d_model_size)[np.newaxis, :], d_model_size)
 
     sines = np.sin(angle_rads[:, 0::2])
     cosines = np.cos(angle_rads[:, 1::2])
@@ -49,27 +49,28 @@ def positional_encoding(position, d_model_size):
     pos_encoding = tf.cast(np.concatenate([sines, cosines], axis=-1), dtype=tf.float32)
     return pos_encoding
 
+
 def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
     # calculate attention
     matmul_qk = tf.matmul(q, k, transpose_b=True)
-    
+
     dk = tf.cast(shape_list(k)[-1], tf.float32)
     scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
 
     if mask is not None:
-        scaled_attention_logits += (mask * -1e4)
+        scaled_attention_logits += mask * -1e4
 
     if attention_mask is not None:
         # Apply the attention mask
         scaled_attention_logits = scaled_attention_logits + attention_mask
 
-    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) 
+    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
 
     # Mask heads if we want to
     if head_mask is not None:
         attention_weights = attention_weights * head_mask
 
-    output = tf.matmul(attention_weights, v) 
+    output = tf.matmul(attention_weights, v)
 
     return output, attention_weights
 
@@ -83,11 +84,11 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
 
         self.depth = int(d_model_size / self.num_heads)
 
-        self.Wq = tf.keras.layers.Dense(d_model_size, name='Wq')
-        self.Wk = tf.keras.layers.Dense(d_model_size, name='Wk')
-        self.Wv = tf.keras.layers.Dense(d_model_size, name='Wv')
+        self.Wq = tf.keras.layers.Dense(d_model_size, name="Wq")
+        self.Wk = tf.keras.layers.Dense(d_model_size, name="Wk")
+        self.Wv = tf.keras.layers.Dense(d_model_size, name="Wv")
 
-        self.dense = tf.keras.layers.Dense(d_model_size, name='dense')
+        self.dense = tf.keras.layers.Dense(d_model_size, name="dense")
 
     def split_into_heads(self, x, batch_size):
         x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
@@ -113,7 +114,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
         output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
         scaled_attention = tf.transpose(output[0], perm=[0, 2, 1, 3])
         attn = output[1]
-        original_size_attention = tf.reshape(scaled_attention,  (batch_size, -1, self.d_model_size))
+        original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size))
         output = self.dense(original_size_attention)
 
         outputs = (output, present)
@@ -122,22 +123,22 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
         return outputs
 
 
-
 def point_wise_feed_forward_network(d_model_size, dff, name=""):
-    return tf.keras.Sequential([
-            tf.keras.layers.Dense(dff, activation='relu', name="0"), 
-            tf.keras.layers.Dense(d_model_size, name="2")
-        ], name="ffn")
+    return tf.keras.Sequential(
+        [tf.keras.layers.Dense(dff, activation="relu", name="0"), tf.keras.layers.Dense(d_model_size, name="2")],
+        name="ffn",
+    )
 
 
 class TFEncoderLayer(tf.keras.layers.Layer):
-    def __init__(self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs):
+    def __init__(
+        self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
+    ):
         super(TFEncoderLayer, self).__init__(**kwargs)
 
-        self.multi_head_attention = TFMultiHeadAttention(d_model_size,
-                                                         num_heads,
-                                                         output_attentions,
-                                                         name="multi_head_attention")
+        self.multi_head_attention = TFMultiHeadAttention(
+            d_model_size, num_heads, output_attentions, name="multi_head_attention"
+        )
         self.ffn = point_wise_feed_forward_network(d_model_size, dff, name="ffn")
 
         self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1")
@@ -149,8 +150,9 @@ class TFEncoderLayer(tf.keras.layers.Layer):
     def call(self, inputs, training=False):
         x, mask, layer_past, attention_mask, head_mask = inputs
         normed = self.layernorm1(x)
-        attn_outputs = self.multi_head_attention([normed, normed, normed, mask, layer_past,
-                                                  attention_mask, head_mask], training=training)
+        attn_outputs = self.multi_head_attention(
+            [normed, normed, normed, mask, layer_past, attention_mask, head_mask], training=training
+        )
         attn_output = attn_outputs[0]
         attn_output = self.dropout1(attn_output, training=training)
         out1 = x + attn_output
@@ -176,20 +178,23 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
 
         self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size)
 
-
-        self.w = TFSharedEmbeddings(config.vocab_size,
-                                    config.n_embd,
-                                    initializer_range=config.initializer_range,
-                                    name="w")
+        self.w = TFSharedEmbeddings(
+            config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="w"
+        )
 
         self.dropout = tf.keras.layers.Dropout(config.embd_pdrop)
-        self.h = [TFEncoderLayer(config.n_embd,
-                                 config.n_head,
-                                 config.dff,
-                                 config.resid_pdrop,
-                                 config.layer_norm_epsilon,
-                                 config.output_attentions,
-                                 name='h_._{}'.format(i)) for i in range(config.n_layer)]
+        self.h = [
+            TFEncoderLayer(
+                config.n_embd,
+                config.n_head,
+                config.dff,
+                config.resid_pdrop,
+                config.layer_norm_epsilon,
+                config.output_attentions,
+                name="h_._{}".format(i),
+            )
+            for i in range(config.n_layer)
+        ]
         self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")
 
     def get_input_embeddings(self):
@@ -204,7 +209,17 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
         """
         raise NotImplementedError
 
-    def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
+    def call(
+        self,
+        inputs,
+        past=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        training=False,
+    ):
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
             past = inputs[1] if len(inputs) > 1 else past
@@ -215,13 +230,13 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
             inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
             assert len(inputs) <= 7, "Too many inputs."
         elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            past = inputs.get('past', past)
-            attention_mask = inputs.get('attention_mask', attention_mask)
-            token_type_ids = inputs.get('token_type_ids', token_type_ids)
-            position_ids = inputs.get('position_ids', position_ids)
-            head_mask = inputs.get('head_mask', head_mask)
-            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            input_ids = inputs.get("input_ids")
+            past = inputs.get("past", past)
+            attention_mask = inputs.get("attention_mask", attention_mask)
+            token_type_ids = inputs.get("token_type_ids", token_type_ids)
+            position_ids = inputs.get("position_ids", position_ids)
+            head_mask = inputs.get("head_mask", head_mask)
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             assert len(inputs) <= 7, "Too many inputs."
         else:
             input_ids = inputs
@@ -276,14 +291,14 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
 
         if token_type_ids is not None:
             token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
-            token_type_embeds = self.w(token_type_ids, mode='embedding')
+            token_type_embeds = self.w(token_type_ids, mode="embedding")
             token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, tf.float32))
         else:
             token_type_embeds = 0
         position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
 
         if inputs_embeds is None:
-            inputs_embeds = self.w(input_ids, mode='embedding')
+            inputs_embeds = self.w(input_ids, mode="embedding")
         seq_len = input_shape[-1]
         mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
 
@@ -333,6 +348,7 @@ class TFCTRLPreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = CTRLConfig
     pretrained_model_archive_map = TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "transformer"
@@ -392,8 +408,12 @@ CTRL_INPUTS_DOCSTRING = r"""    Inputs:
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
-                                            CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
+    CTRL_START_DOCSTRING,
+    CTRL_INPUTS_DOCSTRING,
+)
 class TFCTRLModel(TFCTRLPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -423,9 +443,10 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFCTRLModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFCTRLMainLayer(config, name='transformer')
+        self.transformer = TFCTRLMainLayer(config, name="transformer")
 
     def call(self, inputs, **kwargs):
         outputs = self.transformer(inputs, **kwargs)
@@ -442,10 +463,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
         self.input_embeddings = input_embeddings
 
     def build(self, input_shape):
-        self.bias = self.add_weight(shape=(self.vocab_size,),
-                                    initializer='zeros',
-                                    trainable=True,
-                                    name='bias')
+        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
         super(TFCTRLLMHead, self).build(input_shape)
 
     def call(self, hidden_states):
@@ -454,8 +472,12 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
         return hidden_states
 
 
-@add_start_docstrings("""The CTRL Model transformer with a language modeling head on top
-(linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """The CTRL Model transformer with a language modeling head on top
+(linear layer with weights tied to the input embeddings). """,
+    CTRL_START_DOCSTRING,
+    CTRL_INPUTS_DOCSTRING,
+)
 class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -486,9 +508,10 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
         loss, logits = outputs[:2]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFCTRLLMHeadModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFCTRLMainLayer(config, name='transformer')
+        self.transformer = TFCTRLMainLayer(config, name="transformer")
 
         self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")
 
diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py
index afd88d7ebf..e9e89d2e73 100644
--- a/transformers/modeling_tf_distilbert.py
+++ b/transformers/modeling_tf_distilbert.py
@@ -36,9 +36,9 @@ logger = logging.getLogger(__name__)
 
 
 TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
-    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5",
-    'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5",
+    "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
+    "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5",
+    "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5",
 }
 
 
@@ -53,6 +53,7 @@ def gelu(x):
     cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
     return x * cdf
 
+
 def gelu_new(x):
     """Gaussian Error Linear Unit.
     This is a smoother version of the RELU.
@@ -62,24 +63,25 @@ def gelu_new(x):
     Returns:
         `x` with the GELU activation applied.
     """
-    cdf = 0.5 * (1.0 + tf.tanh(
-        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
+    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
     return x * cdf
 
+
 class TFEmbeddings(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFEmbeddings, self).__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.dim = config.dim
         self.initializer_range = config.initializer_range
-        self.word_embeddings = TFSharedEmbeddings(config.vocab_size,
-                                                  config.dim,
-                                                  initializer_range=config.initializer_range,
-                                                  name='word_embeddings')  # padding_idx=0)
-        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
-                                                             config.dim,
-                                                             embeddings_initializer=get_initializer(config.initializer_range),
-                                                             name='position_embeddings')
+        self.word_embeddings = TFSharedEmbeddings(
+            config.vocab_size, config.dim, initializer_range=config.initializer_range, name="word_embeddings"
+        )  # padding_idx=0)
+        self.position_embeddings = tf.keras.layers.Embedding(
+            config.max_position_embeddings,
+            config.dim,
+            embeddings_initializer=get_initializer(config.initializer_range),
+            name="position_embeddings",
+        )
         if config.sinusoidal_pos_embds:
             raise NotImplementedError
 
@@ -92,9 +94,8 @@ class TFEmbeddings(tf.keras.layers.Layer):
             # Create and initialize weights. The random normal initializer was chosen
             # arbitrarily, and works well.
             self.word_embeddings = self.add_weight(
-                "weight",
-                shape=[self.vocab_size, self.dim],
-                initializer=get_initializer(self.initializer_range))
+                "weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range)
+            )
         super(TFEmbeddings, self).build(input_shape)
 
     def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
@@ -149,9 +150,9 @@ class TFEmbeddings(tf.keras.layers.Layer):
             inputs_embeds = tf.gather(self.word_embeddings, input_ids)
         position_embeddings = self.position_embeddings(position_ids)  # (bs, max_seq_length, dim)
 
-        embeddings = inputs_embeds + position_embeddings              # (bs, max_seq_length, dim)
-        embeddings = self.LayerNorm(embeddings)                       # (bs, max_seq_length, dim)
-        embeddings = self.dropout(embeddings, training=training)      # (bs, max_seq_length, dim)
+        embeddings = inputs_embeds + position_embeddings  # (bs, max_seq_length, dim)
+        embeddings = self.LayerNorm(embeddings)  # (bs, max_seq_length, dim)
+        embeddings = self.dropout(embeddings, training=training)  # (bs, max_seq_length, dim)
         return embeddings
 
     def _linear(self, inputs):
@@ -181,18 +182,18 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
 
         assert self.dim % self.n_heads == 0
 
-        self.q_lin = tf.keras.layers.Dense(config.dim,
-                                           kernel_initializer=get_initializer(config.initializer_range),
-                                           name="q_lin")
-        self.k_lin = tf.keras.layers.Dense(config.dim,
-                                           kernel_initializer=get_initializer(config.initializer_range),
-                                           name="k_lin")
-        self.v_lin = tf.keras.layers.Dense(config.dim,
-                                           kernel_initializer=get_initializer(config.initializer_range),
-                                           name="v_lin")
-        self.out_lin = tf.keras.layers.Dense(config.dim,
-                                           kernel_initializer=get_initializer(config.initializer_range),
-                                           name="out_lin")
+        self.q_lin = tf.keras.layers.Dense(
+            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin"
+        )
+        self.k_lin = tf.keras.layers.Dense(
+            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin"
+        )
+        self.v_lin = tf.keras.layers.Dense(
+            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin"
+        )
+        self.out_lin = tf.keras.layers.Dense(
+            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin"
+        )
 
         self.pruned_heads = set()
 
@@ -233,44 +234,49 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
             """ group heads """
             return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
 
-        q = shape(self.q_lin(query))           # (bs, n_heads, q_length, dim_per_head)
-        k = shape(self.k_lin(key))             # (bs, n_heads, k_length, dim_per_head)
-        v = shape(self.v_lin(value))           # (bs, n_heads, k_length, dim_per_head)
+        q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
+        k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
+        v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)
 
-        q = q / math.sqrt(dim_per_head)                     # (bs, n_heads, q_length, dim_per_head)
-        scores = tf.matmul(q, k, transpose_b=True)          # (bs, n_heads, q_length, k_length)
-        mask = tf.reshape(mask, mask_reshape)                           # (bs, n_heads, qlen, klen)
+        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
+        scores = tf.matmul(q, k, transpose_b=True)  # (bs, n_heads, q_length, k_length)
+        mask = tf.reshape(mask, mask_reshape)  # (bs, n_heads, qlen, klen)
         # scores.masked_fill_(mask, -float('inf'))            # (bs, n_heads, q_length, k_length)
         scores = scores - 1e30 * (1.0 - mask)
 
-        weights = tf.nn.softmax(scores, axis=-1)                              # (bs, n_heads, qlen, klen)
-        weights = self.dropout(weights, training=training)                    # (bs, n_heads, qlen, klen)
+        weights = tf.nn.softmax(scores, axis=-1)  # (bs, n_heads, qlen, klen)
+        weights = self.dropout(weights, training=training)  # (bs, n_heads, qlen, klen)
 
         # Mask heads if we want to
         if head_mask is not None:
             weights = weights * head_mask
 
-        context = tf.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
-        context = unshape(context)             # (bs, q_length, dim)
-        context = self.out_lin(context)        # (bs, q_length, dim)
+        context = tf.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)  # (bs, q_length, dim)
+        context = self.out_lin(context)  # (bs, q_length, dim)
 
         if self.output_attentions:
             return (context, weights)
         else:
             return (context,)
 
+
 class TFFFN(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFFFN, self).__init__(**kwargs)
         self.dropout = tf.keras.layers.Dropout(config.dropout)
-        self.lin1 = tf.keras.layers.Dense(config.hidden_dim,
-                                          kernel_initializer=get_initializer(config.initializer_range),
-                                          name="lin1")
-        self.lin2 = tf.keras.layers.Dense(config.dim,
-                                          kernel_initializer=get_initializer(config.initializer_range),
-                                          name="lin2")
-        assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
-        self.activation = tf.keras.layers.Activation(gelu) if config.activation=='gelu' else tf.keras.activations.relu
+        self.lin1 = tf.keras.layers.Dense(
+            config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1"
+        )
+        self.lin2 = tf.keras.layers.Dense(
+            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2"
+        )
+        assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format(
+            config.activation
+        )
+        self.activation = (
+            tf.keras.layers.Activation(gelu) if config.activation == "gelu" else tf.keras.activations.relu
+        )
 
     def call(self, input, training=False):
         x = self.lin1(input)
@@ -318,14 +324,14 @@ class TFTransformerBlock(tf.keras.layers.Layer):
         # Self-Attention
         sa_output = self.attention([x, x, x, attn_mask, head_mask], training=training)
         if self.output_attentions:
-            sa_output, sa_weights = sa_output                  # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
-        else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples
+            sa_output, sa_weights = sa_output  # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
+        else:  # To handle these `output_attention` or `output_hidden_states` cases returning tuples
             # assert type(sa_output) == tuple
             sa_output = sa_output[0]
-        sa_output = self.sa_layer_norm(sa_output + x)          # (bs, seq_length, dim)
+        sa_output = self.sa_layer_norm(sa_output + x)  # (bs, seq_length, dim)
 
         # Feed Forward Network
-        ffn_output = self.ffn(sa_output, training=training)                             # (bs, seq_length, dim)
+        ffn_output = self.ffn(sa_output, training=training)  # (bs, seq_length, dim)
         ffn_output = self.output_layer_norm(ffn_output + sa_output)  # (bs, seq_length, dim)
 
         output = (ffn_output,)
@@ -341,8 +347,7 @@ class TFTransformer(tf.keras.layers.Layer):
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
 
-        self.layer = [TFTransformerBlock(config, name='layer_._{}'.format(i))
-                      for i in range(config.n_layers)]
+        self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)]
 
     def call(self, inputs, training=False):
         """
@@ -401,8 +406,8 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
         super(TFDistilBertMainLayer, self).__init__(**kwargs)
         self.num_hidden_layers = config.num_hidden_layers
 
-        self.embeddings = TFEmbeddings(config, name="embeddings")   # Embeddings
-        self.transformer = TFTransformer(config, name="transformer") # Encoder
+        self.embeddings = TFEmbeddings(config, name="embeddings")  # Embeddings
+        self.transformer = TFTransformer(config, name="transformer")  # Encoder
 
     def get_input_embeddings(self):
         return self.embeddings
@@ -421,10 +426,10 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
             inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
             assert len(inputs) <= 4, "Too many inputs."
         elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            attention_mask = inputs.get('attention_mask', attention_mask)
-            head_mask = inputs.get('head_mask', head_mask)
-            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            input_ids = inputs.get("input_ids")
+            attention_mask = inputs.get("attention_mask", attention_mask)
+            head_mask = inputs.get("head_mask", head_mask)
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             assert len(inputs) <= 4, "Too many inputs."
         else:
             input_ids = inputs
@@ -439,7 +444,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
             raise ValueError("You have to specify either input_ids or inputs_embeds")
 
         if attention_mask is None:
-            attention_mask = tf.ones(input_shape) # (bs, seq_length)
+            attention_mask = tf.ones(input_shape)  # (bs, seq_length)
         attention_mask = tf.cast(attention_mask, dtype=tf.float32)
 
         # Prepare head mask if needed
@@ -452,10 +457,10 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
         else:
             head_mask = [None] * self.num_hidden_layers
 
-        embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds)   # (bs, seq_length, dim)
+        embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds)  # (bs, seq_length, dim)
         tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training)
 
-        return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions)
+        return tfmr_output  # last-layer hidden-state, (all hidden_states), (all attentions)
 
 
 ### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ###
@@ -463,6 +468,7 @@ class TFDistilBertPreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for downloading and loading pretrained models.
     """
+
     config_class = DistilBertConfig
     pretrained_model_archive_map = TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "distilbert"
@@ -534,8 +540,12 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
-                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
+    DISTILBERT_START_DOCSTRING,
+    DISTILBERT_INPUTS_DOCSTRING,
+)
 class TFDistilBertModel(TFDistilBertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -561,9 +571,10 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs)
-        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")   # Embeddings
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")  # Embeddings
 
     def call(self, inputs, **kwargs):
         outputs = self.distilbert(inputs, **kwargs)
@@ -580,10 +591,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
         self.input_embeddings = input_embeddings
 
     def build(self, input_shape):
-        self.bias = self.add_weight(shape=(self.vocab_size,),
-                                    initializer='zeros',
-                                    trainable=True,
-                                    name='bias')
+        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
         super(TFDistilBertLMHead, self).build(input_shape)
 
     def call(self, hidden_states):
@@ -592,8 +600,11 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
         return hidden_states
 
 
-@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """,
-                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """DistilBert Model with a `masked language modeling` head on top. """,
+    DISTILBERT_START_DOCSTRING,
+    DISTILBERT_INPUTS_DOCSTRING,
+)
 class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -619,6 +630,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
         prediction_scores = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
         self.output_attentions = config.output_attentions
@@ -626,9 +638,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
         self.vocab_size = config.vocab_size
 
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
-        self.vocab_transform = tf.keras.layers.Dense(config.dim,
-                                                     kernel_initializer=get_initializer(config.initializer_range),
-                                                     name="vocab_transform")
+        self.vocab_transform = tf.keras.layers.Dense(
+            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform"
+        )
         self.act = tf.keras.layers.Activation(gelu)
         self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
         self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
@@ -639,9 +651,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
     def call(self, inputs, **kwargs):
         distilbert_output = self.distilbert(inputs, **kwargs)
 
-        hidden_states = distilbert_output[0]                               # (bs, seq_length, dim)
-        prediction_logits = self.vocab_transform(hidden_states)       # (bs, seq_length, dim)
-        prediction_logits = self.act(prediction_logits)               # (bs, seq_length, dim)
+        hidden_states = distilbert_output[0]  # (bs, seq_length, dim)
+        prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
+        prediction_logits = self.act(prediction_logits)  # (bs, seq_length, dim)
         prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
         prediction_logits = self.vocab_projector(prediction_logits)
 
@@ -649,9 +661,12 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
         return outputs  # logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings(
+    """DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
                          the pooled output) e.g. for GLUE tasks. """,
-                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+    DISTILBERT_START_DOCSTRING,
+    DISTILBERT_INPUTS_DOCSTRING,
+)
 class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -677,36 +692,42 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
         logits = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
 
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
-        self.pre_classifier = tf.keras.layers.Dense(config.dim,
-                                                    kernel_initializer=get_initializer(config.initializer_range),
-                                                    activation='relu',
-                                                    name="pre_classifier")
-        self.classifier = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name="classifier")
+        self.pre_classifier = tf.keras.layers.Dense(
+            config.dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="relu",
+            name="pre_classifier",
+        )
+        self.classifier = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
         self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
 
     def call(self, inputs, **kwargs):
         distilbert_output = self.distilbert(inputs, **kwargs)
 
-        hidden_state = distilbert_output[0]                    # (bs, seq_len, dim)
-        pooled_output = hidden_state[:, 0]                    # (bs, dim)
-        pooled_output = self.pre_classifier(pooled_output)   # (bs, dim)
-        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))         # (bs, dim)
-        logits = self.classifier(pooled_output)              # (bs, dim)
+        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
+        pooled_output = hidden_state[:, 0]  # (bs, dim)
+        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
+        pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))  # (bs, dim)
+        logits = self.classifier(pooled_output)  # (bs, dim)
 
         outputs = (logits,) + distilbert_output[1:]
         return outputs  # logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """DistilBert Model with a token classification head on top (a linear layer on top of
     the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+    DISTILBERT_START_DOCSTRING,
+    DISTILBERT_INPUTS_DOCSTRING,
+)
 class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -728,22 +749,23 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
         outputs = model(input_ids)
         scores = outputs[0]
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
 
-        self.distilbert = TFDistilBertMainLayer(config, name='distilbert')
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
         self.dropout = tf.keras.layers.Dropout(config.dropout)
-        self.classifier = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='classifier')
+        self.classifier = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
 
     def call(self, inputs, **kwargs):
         outputs = self.distilbert(inputs, **kwargs)
 
         sequence_output = outputs[0]
 
-        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
+        sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
         logits = self.classifier(sequence_output)
 
         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
@@ -751,9 +773,12 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
         return outputs  # scores, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
                          the hidden-states output to compute `span start logits` and `span end logits`). """,
-                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+    DISTILBERT_START_DOCSTRING,
+    DISTILBERT_INPUTS_DOCSTRING,
+)
 class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -781,22 +806,23 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
         start_scores, end_scores = outputs[:2]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
 
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
-        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='qa_outputs')
+        self.qa_outputs = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
+        )
         assert config.num_labels == 2
         self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
 
     def call(self, inputs, **kwargs):
         distilbert_output = self.distilbert(inputs, **kwargs)
 
-        hidden_states = distilbert_output[0]                                 # (bs, max_query_len, dim)
-        hidden_states = self.dropout(hidden_states, training=kwargs.get('training', False))                       # (bs, max_query_len, dim)
-        logits = self.qa_outputs(hidden_states)                           # (bs, max_query_len, 2)
+        hidden_states = distilbert_output[0]  # (bs, max_query_len, dim)
+        hidden_states = self.dropout(hidden_states, training=kwargs.get("training", False))  # (bs, max_query_len, dim)
+        logits = self.qa_outputs(hidden_states)  # (bs, max_query_len, 2)
         start_logits, end_logits = tf.split(logits, 2, axis=-1)
         start_logits = tf.squeeze(start_logits, axis=-1)
         end_logits = tf.squeeze(end_logits, axis=-1)
diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py
index 718e8f6058..a4722fb343 100644
--- a/transformers/modeling_tf_gpt2.py
+++ b/transformers/modeling_tf_gpt2.py
@@ -28,17 +28,25 @@ from io import open
 import numpy as np
 import tensorflow as tf
 
-from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
-                                TFSequenceSummary, shape_list, get_initializer)
+from .modeling_tf_utils import (
+    TFPreTrainedModel,
+    TFConv1D,
+    TFSharedEmbeddings,
+    TFSequenceSummary,
+    shape_list,
+    get_initializer,
+)
 from .configuration_gpt2 import GPT2Config
 from .file_utils import add_start_docstrings
 
 logger = logging.getLogger(__name__)
 
-TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
-                                     "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
-                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5",
-                                     "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",}
+TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
+    "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
+    "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5",
+    "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",
+}
 
 
 def gelu(x):
@@ -50,8 +58,7 @@ def gelu(x):
     Returns:
         `x` with the GELU activation applied.
     """
-    cdf = 0.5 * (1.0 + tf.tanh(
-        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
+    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
     return x * cdf
 
 
@@ -68,8 +75,8 @@ class TFAttention(tf.keras.layers.Layer):
         self.split_size = n_state
         self.scale = scale
 
-        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn')
-        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj')
+        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
+        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
         self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
         self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
         self.pruned_heads = set()
@@ -82,7 +89,7 @@ class TFAttention(tf.keras.layers.Layer):
         """1's in the lower triangle, counting from the lower right corner.
         Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
         """
-        i = tf.range(nd)[:,None]
+        i = tf.range(nd)[:, None]
         j = tf.range(ns)
         m = i >= j - ns + nd
         return tf.cast(m, dtype)
@@ -92,7 +99,7 @@ class TFAttention(tf.keras.layers.Layer):
         # q, k, v have shape [batch, heads, sequence, features]
         w = tf.matmul(q, k, transpose_b=True)
         if self.scale:
-            dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores
+            dk = tf.cast(shape_list(k)[-1], tf.float32)  # scale attention_scores
             w = w / tf.math.sqrt(dk)
 
         # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
@@ -158,8 +165,8 @@ class TFMLP(tf.keras.layers.Layer):
     def __init__(self, n_state, config, **kwargs):
         super(TFMLP, self).__init__(**kwargs)
         nx = config.n_embd
-        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc')
-        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj')
+        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
+        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
         self.act = gelu
         self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
 
@@ -174,10 +181,10 @@ class TFBlock(tf.keras.layers.Layer):
     def __init__(self, n_ctx, config, scale=False, **kwargs):
         super(TFBlock, self).__init__(**kwargs)
         nx = config.n_embd
-        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1')
-        self.attn = TFAttention(nx, n_ctx, config, scale, name='attn')
-        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2')
-        self.mlp = TFMLP(4 * nx, config, name='mlp')
+        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
+        self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
+        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
+        self.mlp = TFMLP(4 * nx, config, name="mlp")
 
     def call(self, inputs, training=False):
         x, layer_past, attention_mask, head_mask = inputs
@@ -204,20 +211,18 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
         self.vocab_size = config.vocab_size
         self.n_embd = config.n_embd
 
-        self.wte = TFSharedEmbeddings(config.vocab_size,
-                                      config.hidden_size,
-                                      initializer_range=config.initializer_range,
-                                      name='wte')
-        self.wpe = tf.keras.layers.Embedding(config.n_positions,
-                                             config.n_embd,
-                                             embeddings_initializer=get_initializer(config.initializer_range),
-                                             name='wpe')
+        self.wte = TFSharedEmbeddings(
+            config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte"
+        )
+        self.wpe = tf.keras.layers.Embedding(
+            config.n_positions,
+            config.n_embd,
+            embeddings_initializer=get_initializer(config.initializer_range),
+            name="wpe",
+        )
         self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
-        self.h = [TFBlock(config.n_ctx,
-                          config,
-                          scale=True,
-                          name='h_._{}'.format(i)) for i in range(config.n_layer)]
-        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')
+        self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)]
+        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")
 
     def get_input_embeddings(self):
         return self.wte
@@ -231,7 +236,17 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
         """
         raise NotImplementedError
 
-    def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
+    def call(
+        self,
+        inputs,
+        past=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        training=False,
+    ):
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
             past = inputs[1] if len(inputs) > 1 else past
@@ -242,13 +257,13 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
             inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
             assert len(inputs) <= 7, "Too many inputs."
         elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            past = inputs.get('past', past)
-            attention_mask = inputs.get('attention_mask', attention_mask)
-            token_type_ids = inputs.get('token_type_ids', token_type_ids)
-            position_ids = inputs.get('position_ids', position_ids)
-            head_mask = inputs.get('head_mask', head_mask)
-            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            input_ids = inputs.get("input_ids")
+            past = inputs.get("past", past)
+            attention_mask = inputs.get("attention_mask", attention_mask)
+            token_type_ids = inputs.get("token_type_ids", token_type_ids)
+            position_ids = inputs.get("position_ids", position_ids)
+            head_mask = inputs.get("head_mask", head_mask)
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             assert len(inputs) <= 7, "Too many inputs."
         else:
             input_ids = inputs
@@ -304,11 +319,11 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
         position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
 
         if inputs_embeds is None:
-            inputs_embeds = self.wte(input_ids, mode='embedding')
+            inputs_embeds = self.wte(input_ids, mode="embedding")
         position_embeds = self.wpe(position_ids)
         if token_type_ids is not None:
             token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
-            token_type_embeds = self.wte(token_type_ids, mode='embedding')
+            token_type_embeds = self.wte(token_type_ids, mode="embedding")
         else:
             token_type_embeds = 0
         hidden_states = inputs_embeds + position_embeds + token_type_embeds
@@ -353,6 +368,7 @@ class TFGPT2PreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = GPT2Config
     pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "transformer"
@@ -428,8 +444,12 @@ GPT2_INPUTS_DOCSTRING = r"""    Inputs:
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
-                      GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
+    GPT2_START_DOCSTRING,
+    GPT2_INPUTS_DOCSTRING,
+)
 class TFGPT2Model(TFGPT2PreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -459,17 +479,22 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFGPT2MainLayer(config, name='transformer')
+        self.transformer = TFGPT2MainLayer(config, name="transformer")
 
     def call(self, inputs, **kwargs):
         outputs = self.transformer(inputs, **kwargs)
         return outputs
 
 
-@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
-(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """The GPT2 Model transformer with a language modeling head on top
+(linear layer with weights tied to the input embeddings). """,
+    GPT2_START_DOCSTRING,
+    GPT2_INPUTS_DOCSTRING,
+)
 class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -500,9 +525,10 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
         logits = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFGPT2MainLayer(config, name='transformer')
+        self.transformer = TFGPT2MainLayer(config, name="transformer")
 
     def get_output_embeddings(self):
         return self.transformer.wte
@@ -518,11 +544,15 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
         return outputs  # lm_logits, presents, (all hidden_states), (attentions)
 
 
-@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
+@add_start_docstrings(
+    """The GPT2 Model transformer with a language modeling and a multiple-choice classification
 head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
 The language modeling head has its weights tied to the input embeddings,
 the classification head takes as input the input of a specified classification token index in the input sequence).
-""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
+""",
+    GPT2_START_DOCSTRING,
+    GPT2_INPUTS_DOCSTRING,
+)
 class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
     r"""
         **mc_token_ids**: (`optional`, default to index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``:
@@ -572,16 +602,30 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
         lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
         config.num_labels = 1
-        self.transformer = TFGPT2MainLayer(config, name='transformer')
-        self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
+        self.transformer = TFGPT2MainLayer(config, name="transformer")
+        self.multiple_choice_head = TFSequenceSummary(
+            config, initializer_range=config.initializer_range, name="multiple_choice_head"
+        )
 
     def get_output_embeddings(self):
         return self.transformer.wte
 
-    def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False):
+    def call(
+        self,
+        inputs,
+        past=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        mc_token_ids=None,
+        training=False,
+    ):
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
             past = inputs[1] if len(inputs) > 1 else past
@@ -593,14 +637,14 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
             mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids
             assert len(inputs) <= 8, "Too many inputs."
         elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            past = inputs.get('past', past)
-            attention_mask = inputs.get('attention_mask', attention_mask)
-            token_type_ids = inputs.get('token_type_ids', token_type_ids)
-            position_ids = inputs.get('position_ids', position_ids)
-            head_mask = inputs.get('head_mask', head_mask)
-            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
-            mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
+            input_ids = inputs.get("input_ids")
+            past = inputs.get("past", past)
+            attention_mask = inputs.get("attention_mask", attention_mask)
+            token_type_ids = inputs.get("token_type_ids", token_type_ids)
+            position_ids = inputs.get("position_ids", position_ids)
+            head_mask = inputs.get("head_mask", head_mask)
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
+            mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
             assert len(inputs) <= 8, "Too many inputs."
         else:
             input_ids = inputs
@@ -617,7 +661,15 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
         flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
         flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
 
-        flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds]
+        flat_inputs = [
+            flat_input_ids,
+            past,
+            flat_attention_mask,
+            flat_token_type_ids,
+            flat_position_ids,
+            head_mask,
+            inputs_embeds,
+        ]
 
         transformer_outputs = self.transformer(flat_inputs, training=training)
         hidden_states = transformer_outputs[0]
diff --git a/transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py
index 791c6dcc18..4720e3c5db 100644
--- a/transformers/modeling_tf_openai.py
+++ b/transformers/modeling_tf_openai.py
@@ -28,14 +28,22 @@ from io import open
 import numpy as np
 import tensorflow as tf
 
-from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
-                                TFSequenceSummary, shape_list, get_initializer)
+from .modeling_tf_utils import (
+    TFPreTrainedModel,
+    TFConv1D,
+    TFSharedEmbeddings,
+    TFSequenceSummary,
+    shape_list,
+    get_initializer,
+)
 from .configuration_openai import OpenAIGPTConfig
 from .file_utils import add_start_docstrings
 
 logger = logging.getLogger(__name__)
 
-TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"}
+TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"
+}
 
 
 def gelu(x):
@@ -47,8 +55,7 @@ def gelu(x):
     Returns:
         `x` with the GELU activation applied.
     """
-    cdf = 0.5 * (1.0 + tf.tanh(
-        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
+    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
     return x * cdf
 
 
@@ -56,9 +63,11 @@ def swish(x):
     return x * tf.math.sigmoid(x)
 
 
-ACT_FNS = {"gelu": tf.keras.layers.Activation(gelu),
-           "relu": tf.keras.activations.relu,
-           "swish": tf.keras.layers.Activation(swish)}
+ACT_FNS = {
+    "gelu": tf.keras.layers.Activation(gelu),
+    "relu": tf.keras.activations.relu,
+    "swish": tf.keras.layers.Activation(swish),
+}
 
 
 class TFAttention(tf.keras.layers.Layer):
@@ -74,8 +83,8 @@ class TFAttention(tf.keras.layers.Layer):
         self.split_size = n_state
         self.scale = scale
 
-        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn')
-        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj')
+        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
+        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
         self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
         self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
         self.pruned_heads = set()
@@ -88,7 +97,7 @@ class TFAttention(tf.keras.layers.Layer):
         """1's in the lower triangle, counting from the lower right corner.
         Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
         """
-        i = tf.range(nd)[:,None]
+        i = tf.range(nd)[:, None]
         j = tf.range(ns)
         m = i >= j - ns + nd
         return tf.cast(m, dtype)
@@ -98,7 +107,7 @@ class TFAttention(tf.keras.layers.Layer):
         # q, k, v have shape [batch, heads, sequence, features]
         w = tf.matmul(q, k, transpose_b=True)
         if self.scale:
-            dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores
+            dk = tf.cast(shape_list(k)[-1], tf.float32)  # scale attention_scores
             w = w / tf.math.sqrt(dk)
 
         # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
@@ -159,8 +168,8 @@ class TFMLP(tf.keras.layers.Layer):
     def __init__(self, n_state, config, **kwargs):
         super(TFMLP, self).__init__(**kwargs)
         nx = config.n_embd
-        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc')
-        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj')
+        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
+        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
         self.act = gelu
         self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
 
@@ -175,10 +184,10 @@ class TFBlock(tf.keras.layers.Layer):
     def __init__(self, n_ctx, config, scale=False, **kwargs):
         super(TFBlock, self).__init__(**kwargs)
         nx = config.n_embd
-        self.attn = TFAttention(nx, n_ctx, config, scale, name='attn')
-        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1')
-        self.mlp = TFMLP(4 * nx, config, name='mlp')
-        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2')
+        self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
+        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
+        self.mlp = TFMLP(4 * nx, config, name="mlp")
+        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
 
     def call(self, inputs, training=False):
         x, attention_mask, head_mask = inputs
@@ -203,19 +212,17 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
         self.vocab_size = config.vocab_size
         self.n_embd = config.n_embd
 
-        self.tokens_embed = TFSharedEmbeddings(config.vocab_size,
-                                               config.n_embd,
-                                               initializer_range=config.initializer_range,
-                                               name='tokens_embed')
-        self.positions_embed = tf.keras.layers.Embedding(config.n_positions,
-                                                         config.n_embd,
-                                                         embeddings_initializer=get_initializer(config.initializer_range),
-                                                         name='positions_embed')
+        self.tokens_embed = TFSharedEmbeddings(
+            config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed"
+        )
+        self.positions_embed = tf.keras.layers.Embedding(
+            config.n_positions,
+            config.n_embd,
+            embeddings_initializer=get_initializer(config.initializer_range),
+            name="positions_embed",
+        )
         self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
-        self.h = [TFBlock(config.n_ctx,
-                          config,
-                          scale=True,
-                          name='h_._{}'.format(i)) for i in range(config.n_layer)]
+        self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)]
 
     def get_input_embeddings(self):
         return self.tokens_embed
@@ -229,7 +236,16 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
         """
         raise NotImplementedError
 
-    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
+    def call(
+        self,
+        inputs,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        training=False,
+    ):
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
             attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
@@ -239,12 +255,12 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
             inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
             assert len(inputs) <= 6, "Too many inputs."
         elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            attention_mask = inputs.get('attention_mask', attention_mask)
-            token_type_ids = inputs.get('token_type_ids', token_type_ids)
-            position_ids = inputs.get('position_ids', position_ids)
-            head_mask = inputs.get('head_mask', head_mask)
-            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            input_ids = inputs.get("input_ids")
+            attention_mask = inputs.get("attention_mask", attention_mask)
+            token_type_ids = inputs.get("token_type_ids", token_type_ids)
+            position_ids = inputs.get("position_ids", position_ids)
+            head_mask = inputs.get("head_mask", head_mask)
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             assert len(inputs) <= 6, "Too many inputs."
         else:
             input_ids = inputs
@@ -295,11 +311,11 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
         position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
 
         if inputs_embeds is None:
-            inputs_embeds = self.tokens_embed(input_ids, mode='embedding')
+            inputs_embeds = self.tokens_embed(input_ids, mode="embedding")
         position_embeds = self.positions_embed(position_ids)
         if token_type_ids is not None:
             token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
-            token_type_embeds = self.tokens_embed(token_type_ids, mode='embedding')
+            token_type_embeds = self.tokens_embed(token_type_ids, mode="embedding")
         else:
             token_type_embeds = 0
         hidden_states = inputs_embeds + position_embeds + token_type_embeds
@@ -338,6 +354,7 @@ class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = OpenAIGPTConfig
     pretrained_model_archive_map = TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "transformer"
@@ -409,8 +426,12 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
-                      OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
+    OPENAI_GPT_START_DOCSTRING,
+    OPENAI_GPT_INPUTS_DOCSTRING,
+)
 class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -436,17 +457,22 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
+        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
 
     def call(self, inputs, **kwargs):
         outputs = self.transformer(inputs, **kwargs)
         return outputs
 
 
-@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
-(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """OpenAI GPT Model transformer with a language modeling head on top
+(linear layer with weights tied to the input embeddings). """,
+    OPENAI_GPT_START_DOCSTRING,
+    OPENAI_GPT_INPUTS_DOCSTRING,
+)
 class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -472,9 +498,10 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
         logits = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
+        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
 
     def get_output_embeddings(self):
         return self.transformer.tokens_embed
@@ -490,11 +517,15 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
         return outputs  # lm_logits, (all hidden_states), (attentions)
 
 
-@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
+@add_start_docstrings(
+    """OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
 head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
 The language modeling head has its weights tied to the input embeddings,
 the classification head takes as input the input of a specified classification token index in the input sequence).
-""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
+""",
+    OPENAI_GPT_START_DOCSTRING,
+    OPENAI_GPT_INPUTS_DOCSTRING,
+)
 class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
     r"""
         **mc_token_ids**: (`optional`, default to index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``:
@@ -536,16 +567,29 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
         lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
         config.num_labels = 1
-        self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
-        self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
+        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
+        self.multiple_choice_head = TFSequenceSummary(
+            config, initializer_range=config.initializer_range, name="multiple_choice_head"
+        )
 
     def get_output_embeddings(self):
         return self.transformer.tokens_embed
 
-    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False):
+    def call(
+        self,
+        inputs,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        mc_token_ids=None,
+        training=False,
+    ):
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
             attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
@@ -556,13 +600,13 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
             mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
             assert len(inputs) <= 7, "Too many inputs."
         elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            attention_mask = inputs.get('attention_mask', attention_mask)
-            token_type_ids = inputs.get('token_type_ids', token_type_ids)
-            position_ids = inputs.get('position_ids', position_ids)
-            head_mask = inputs.get('head_mask', head_mask)
-            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
-            mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
+            input_ids = inputs.get("input_ids")
+            attention_mask = inputs.get("attention_mask", attention_mask)
+            token_type_ids = inputs.get("token_type_ids", token_type_ids)
+            position_ids = inputs.get("position_ids", position_ids)
+            head_mask = inputs.get("head_mask", head_mask)
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
+            mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
             assert len(inputs) <= 7, "Too many inputs."
         else:
             input_ids = inputs
@@ -579,7 +623,14 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
         flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
         flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
 
-        flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds]
+        flat_inputs = [
+            flat_input_ids,
+            flat_attention_mask,
+            flat_token_type_ids,
+            flat_position_ids,
+            head_mask,
+            inputs_embeds,
+        ]
 
         transformer_outputs = self.transformer(flat_inputs, training=training)
         hidden_states = transformer_outputs[0]
diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py
index 190caff18d..d1073d23a3 100644
--- a/transformers/modeling_tf_pytorch_utils.py
+++ b/transformers/modeling_tf_pytorch_utils.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """ PyTorch - TF 2.0 general utilities."""
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 import os
@@ -25,7 +24,8 @@ import numpy
 
 logger = logging.getLogger(__name__)
 
-def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=''):
+
+def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=""):
     """ Convert a TF 2.0 model variable name in a pytorch model weight name.
 
         Conventions for TF2.0 scopes -> PyTorch attribute names conversions:
@@ -36,26 +36,30 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove='')
             - pytorch model weight name
             - transpose: boolean indicating weither TF2.0 and PyTorch weights matrices are transposed with regards to each other
     """
-    tf_name = tf_name.replace(':0', '')                       # device ids
-    tf_name = re.sub(r'/[^/]*___([^/]*)/', r'/\1/', tf_name)  # '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
-    tf_name = tf_name.replace('_._', '/')                     # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
-    tf_name = re.sub(r'//+', '/', tf_name)                    # Remove empty levels at the end
-    tf_name = tf_name.split('/')                              # Convert from TF2.0 '/' separators to PyTorch '.' separators
-    tf_name = tf_name[1:]                                     # Remove level zero
+    tf_name = tf_name.replace(":0", "")  # device ids
+    tf_name = re.sub(
+        r"/[^/]*___([^/]*)/", r"/\1/", tf_name
+    )  # '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
+    tf_name = tf_name.replace(
+        "_._", "/"
+    )  # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
+    tf_name = re.sub(r"//+", "/", tf_name)  # Remove empty levels at the end
+    tf_name = tf_name.split("/")  # Convert from TF2.0 '/' separators to PyTorch '.' separators
+    tf_name = tf_name[1:]  # Remove level zero
 
     # When should we transpose the weights
-    transpose = bool(tf_name[-1] == 'kernel' or 'emb_projs' in tf_name or 'out_projs' in tf_name)
+    transpose = bool(tf_name[-1] == "kernel" or "emb_projs" in tf_name or "out_projs" in tf_name)
 
     # Convert standard TF2.0 names in PyTorch names
-    if tf_name[-1] == 'kernel' or tf_name[-1] == 'embeddings' or tf_name[-1] == 'gamma':
-        tf_name[-1] = 'weight'
-    if tf_name[-1] == 'beta':
-        tf_name[-1] = 'bias'
+    if tf_name[-1] == "kernel" or tf_name[-1] == "embeddings" or tf_name[-1] == "gamma":
+        tf_name[-1] = "weight"
+    if tf_name[-1] == "beta":
+        tf_name[-1] = "bias"
 
     # Remove prefix if needed
-    tf_name = '.'.join(tf_name)
+    tf_name = ".".join(tf_name)
     if start_prefix_to_remove:
-        tf_name = tf_name.replace(start_prefix_to_remove, '', 1)
+        tf_name = tf_name.replace(start_prefix_to_remove, "", 1)
 
     return tf_name, transpose
 
@@ -63,6 +67,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove='')
 #####################
 ### PyTorch => TF 2.0
 
+
 def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
     """ Load pytorch checkpoints in a TF 2.0 model
     """
@@ -70,17 +75,21 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i
         import tensorflow as tf
         import torch
     except ImportError as e:
-        logger.error("Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
-            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error(
+            "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
+            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
+        )
         raise e
 
     pt_path = os.path.abspath(pytorch_checkpoint_path)
     logger.info("Loading PyTorch weights from {}".format(pt_path))
 
-    pt_state_dict = torch.load(pt_path, map_location='cpu')
+    pt_state_dict = torch.load(pt_path, map_location="cpu")
     logger.info("PyTorch checkpoint contains {:,} parameters".format(sum(t.numel() for t in pt_state_dict.values())))
 
-    return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys)
+    return load_pytorch_weights_in_tf2_model(
+        tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys
+    )
 
 
 def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False):
@@ -88,7 +97,9 @@ def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_mi
     """
     pt_state_dict = pt_model.state_dict()
 
-    return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys)
+    return load_pytorch_weights_in_tf2_model(
+        tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys
+    )
 
 
 def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False):
@@ -99,8 +110,10 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
         import tensorflow as tf
         from tensorflow.python.keras import backend as K
     except ImportError as e:
-        logger.error("Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
-            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error(
+            "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
+            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
+        )
         raise e
 
     if tf_inputs is None:
@@ -115,10 +128,10 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
     new_keys = []
     for key in pt_state_dict.keys():
         new_key = None
-        if 'gamma' in key:
-            new_key = key.replace('gamma', 'weight')
-        if 'beta' in key:
-            new_key = key.replace('beta', 'bias')
+        if "gamma" in key:
+            new_key = key.replace("gamma", "weight")
+        if "beta" in key:
+            new_key = key.replace("beta", "bias")
         if new_key:
             old_keys.append(key)
             new_keys.append(new_key)
@@ -127,9 +140,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
 
     # Make sure we are able to load PyTorch base models as well as derived models (with heads)
     # TF models always have a prefix, some of PyTorch models (base ones) don't
-    start_prefix_to_remove = ''
+    start_prefix_to_remove = ""
     if not any(s.startswith(tf_model.base_model_prefix) for s in pt_state_dict.keys()):
-        start_prefix_to_remove = tf_model.base_model_prefix + '.'
+        start_prefix_to_remove = tf_model.base_model_prefix + "."
 
     symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
     tf_loaded_numel = 0
@@ -137,7 +150,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
     all_pytorch_weights = set(list(pt_state_dict.keys()))
     for symbolic_weight in symbolic_weights:
         sw_name = symbolic_weight.name
-        name, transpose = convert_tf_weight_name_to_pt_weight_name(sw_name, start_prefix_to_remove=start_prefix_to_remove)
+        name, transpose = convert_tf_weight_name_to_pt_weight_name(
+            sw_name, start_prefix_to_remove=start_prefix_to_remove
+        )
 
         # Find associated numpy array in pytorch model state dict
         if name not in pt_state_dict:
@@ -182,6 +197,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
 #####################
 ### TF 2.0 => PyTorch
 
+
 def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
     """ Load TF 2.0 HDF5 checkpoint in a PyTorch model
         We use HDF5 to easily do transfer learning
@@ -191,8 +207,10 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
         import tensorflow as tf
         import torch
     except ImportError as e:
-        logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
-            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
+            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
+        )
         raise e
 
     import transformers
@@ -215,6 +233,7 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
 
     return load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=allow_missing_keys)
 
+
 def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False):
     """ Load TF 2.0 model in a pytorch model
     """
@@ -230,8 +249,10 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
         import tensorflow as tf
         import torch
     except ImportError as e:
-        logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
-            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
+            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
+        )
         raise e
 
     new_pt_params_dict = {}
@@ -239,14 +260,16 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
 
     # Make sure we are able to load PyTorch base models as well as derived models (with heads)
     # TF models always have a prefix, some of PyTorch models (base ones) don't
-    start_prefix_to_remove = ''
+    start_prefix_to_remove = ""
     if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict.keys()):
-        start_prefix_to_remove = pt_model.base_model_prefix + '.'
+        start_prefix_to_remove = pt_model.base_model_prefix + "."
 
     # Build a map from potential PyTorch weight names to TF 2.0 Variables
     tf_weights_map = {}
     for tf_weight in tf_weights:
-        pt_name, transpose = convert_tf_weight_name_to_pt_weight_name(tf_weight.name, start_prefix_to_remove=start_prefix_to_remove)
+        pt_name, transpose = convert_tf_weight_name_to_pt_weight_name(
+            tf_weight.name, start_prefix_to_remove=start_prefix_to_remove
+        )
         tf_weights_map[pt_name] = (tf_weight.numpy(), transpose)
 
     all_tf_weights = set(list(tf_weights_map.keys()))
@@ -291,11 +314,13 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
     missing_keys += missing_keys_pt
 
     if len(missing_keys) > 0:
-        logger.info("Weights of {} not initialized from TF 2.0 model: {}".format(
-            pt_model.__class__.__name__, missing_keys))
+        logger.info(
+            "Weights of {} not initialized from TF 2.0 model: {}".format(pt_model.__class__.__name__, missing_keys)
+        )
     if len(unexpected_keys) > 0:
-        logger.info("Weights from TF 2.0 model not used in {}: {}".format(
-            pt_model.__class__.__name__, unexpected_keys))
+        logger.info(
+            "Weights from TF 2.0 model not used in {}: {}".format(pt_model.__class__.__name__, unexpected_keys)
+        )
 
     logger.info("Weights or buffers not loaded from TF 2.0 model: {}".format(all_tf_weights))
 
diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py
index 15282bd6cc..136ab66157 100644
--- a/transformers/modeling_tf_roberta.py
+++ b/transformers/modeling_tf_roberta.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """ TF 2.0 RoBERTa model. """
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 
@@ -31,16 +30,18 @@ from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new
 logger = logging.getLogger(__name__)
 
 TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tf_model.h5",
-    'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tf_model.h5",
-    'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tf_model.h5",
-    'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-tf_model.h5",
+    "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tf_model.h5",
+    "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tf_model.h5",
+    "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tf_model.h5",
+    "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-tf_model.h5",
 }
 
+
 class TFRobertaEmbeddings(TFBertEmbeddings):
     """
     Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
     """
+
     def __init__(self, config, **kwargs):
         super(TFRobertaEmbeddings, self).__init__(config, **kwargs)
         self.padding_idx = 1
@@ -64,9 +65,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
         """
         seq_length = shape_list(inputs_embeds)[1]
 
-        position_ids = tf.range(self.padding_idx + 1,
-                                seq_length + self.padding_idx + 1,
-                                dtype=tf.int32)[tf.newaxis, :]
+        position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :]
         return position_ids
 
     def _embedding(self, inputs, training=False):
@@ -80,16 +79,19 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
             else:
                 position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
 
-        return super(TFRobertaEmbeddings, self)._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
+        return super(TFRobertaEmbeddings, self)._embedding(
+            [input_ids, position_ids, token_type_ids, inputs_embeds], training=training
+        )
 
 
 class TFRobertaMainLayer(TFBertMainLayer):
     """
     Same as TFBertMainLayer but uses TFRobertaEmbeddings.
     """
+
     def __init__(self, config, **kwargs):
         super(TFRobertaMainLayer, self).__init__(config, **kwargs)
-        self.embeddings = TFRobertaEmbeddings(config, name='embeddings')
+        self.embeddings = TFRobertaEmbeddings(config, name="embeddings")
 
     def get_input_embeddings(self):
         return self.embeddings
@@ -99,6 +101,7 @@ class TFRobertaPreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = RobertaConfig
     pretrained_model_archive_map = TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "roberta"
@@ -192,8 +195,12 @@ ROBERTA_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.",
-                      ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.",
+    ROBERTA_START_DOCSTRING,
+    ROBERTA_INPUTS_DOCSTRING,
+)
 class TFRobertaModel(TFRobertaPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -226,9 +233,10 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFRobertaModel, self).__init__(config, *inputs, **kwargs)
-        self.roberta = TFRobertaMainLayer(config, name='roberta')
+        self.roberta = TFRobertaMainLayer(config, name="roberta")
 
     def call(self, inputs, **kwargs):
         outputs = self.roberta(inputs, **kwargs)
@@ -237,13 +245,14 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
 
 class TFRobertaLMHead(tf.keras.layers.Layer):
     """Roberta Head for masked language modeling."""
+
     def __init__(self, config, input_embeddings, **kwargs):
         super(TFRobertaLMHead, self).__init__(**kwargs)
         self.vocab_size = config.vocab_size
-        self.dense = tf.keras.layers.Dense(config.hidden_size,
-                                           kernel_initializer=get_initializer(config.initializer_range),
-                                           name='dense')
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
+        self.dense = tf.keras.layers.Dense(
+            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
         self.act = tf.keras.layers.Activation(gelu)
 
         # The output weights are the same as the input embeddings, but there is
@@ -251,10 +260,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
         self.decoder = input_embeddings
 
     def build(self, input_shape):
-        self.bias = self.add_weight(shape=(self.vocab_size,),
-                                    initializer='zeros',
-                                    trainable=True,
-                                    name='bias')
+        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
         super(TFRobertaLMHead, self).build(input_shape)
 
     def call(self, features):
@@ -268,8 +274,9 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
         return x
 
 
-@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
-    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING
+)
 class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -297,6 +304,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
         prediction_scores = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFRobertaForMaskedLM, self).__init__(config, *inputs, **kwargs)
 
@@ -322,14 +330,16 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
 
     def __init__(self, config, **kwargs):
         super(TFRobertaClassificationHead, self).__init__(config, **kwargs)
-        self.dense = tf.keras.layers.Dense(config.hidden_size,
-                                           kernel_initializer=get_initializer(config.initializer_range),
-                                           activation='tanh',
-                                           name="dense")
+        self.dense = tf.keras.layers.Dense(
+            config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="tanh",
+            name="dense",
+        )
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.out_proj = tf.keras.layers.Dense(config.num_labels,
-                                              kernel_initializer=get_initializer(config.initializer_range),
-                                              name="out_proj")
+        self.out_proj = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
+        )
 
     def call(self, features, training=False):
         x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
@@ -340,9 +350,12 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
         return x
 
 
-@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 
+@add_start_docstrings(
+    """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 
     on top of the pooled output) e.g. for GLUE tasks. """,
-    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+    ROBERTA_START_DOCSTRING,
+    ROBERTA_INPUTS_DOCSTRING,
+)
 class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -369,27 +382,31 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
         logits = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
 
         self.roberta = TFRobertaMainLayer(config, name="roberta")
         self.classifier = TFRobertaClassificationHead(config, name="classifier")
-    
+
     def call(self, inputs, **kwargs):
         outputs = self.roberta(inputs, **kwargs)
 
         sequence_output = outputs[0]
-        logits = self.classifier(sequence_output, training=kwargs.get('training', False))
+        logits = self.classifier(sequence_output, training=kwargs.get("training", False))
 
         outputs = (logits,) + outputs[2:]
 
         return outputs  # logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""RoBERTa Model with a token classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """RoBERTa Model with a token classification head on top (a linear layer on top of
     the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+    ROBERTA_START_DOCSTRING,
+    ROBERTA_INPUTS_DOCSTRING,
+)
 class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -415,22 +432,23 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
         scores = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFRobertaForTokenClassification, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
 
-        self.roberta = TFRobertaMainLayer(config, name='roberta')
+        self.roberta = TFRobertaMainLayer(config, name="roberta")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='classifier')
+        self.classifier = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
 
     def call(self, inputs, **kwargs):
         outputs = self.roberta(inputs, **kwargs)
 
         sequence_output = outputs[0]
 
-        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
+        sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
         logits = self.classifier(sequence_output)
 
         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py
index e803e00c8d..38a2bf4190 100644
--- a/transformers/modeling_tf_t5.py
+++ b/transformers/modeling_tf_t5.py
@@ -31,11 +31,11 @@ from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK
 logger = logging.getLogger(__name__)
 
 TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5",
-    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5",
-    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5",
-    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5",
-    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5",
+    "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5",
+    "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5",
+    "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5",
+    "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5",
+    "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5",
 }
 
 ####################################################
@@ -44,6 +44,7 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
 # - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model)
 ####################################################
 
+
 class TFT5LayerNorm(tf.keras.layers.Layer):
     def __init__(self, epsilon=1e-6, **kwargs):
         """ Construct a layernorm module in the T5 style
@@ -54,10 +55,7 @@ class TFT5LayerNorm(tf.keras.layers.Layer):
 
     def build(self, input_shape):
         """Build shared word embedding layer """
-        self.weight = self.add_weight(
-            "weight",
-            shape=(input_shape[-1],),
-            initializer='ones')
+        self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones")
         super(TFT5LayerNorm, self).build(input_shape)
 
     def call(self, x):
@@ -69,8 +67,8 @@ class TFT5LayerNorm(tf.keras.layers.Layer):
 class TFT5DenseReluDense(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFT5DenseReluDense, self).__init__(**kwargs)
-        self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name='wi')
-        self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name='wo')
+        self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name="wi")
+        self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name="wo")
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
         self.act = tf.keras.activations.relu
 
@@ -85,9 +83,8 @@ class TFT5DenseReluDense(tf.keras.layers.Layer):
 class TFT5LayerFF(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFT5LayerFF, self).__init__(**kwargs)
-        self.DenseReluDense = TFT5DenseReluDense(config, name='DenseReluDense')
-        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
-                                        name='layer_norm')
+        self.DenseReluDense = TFT5DenseReluDense(config, name="DenseReluDense")
+        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
 
     def call(self, hidden_states, training=False):
@@ -114,26 +111,23 @@ class TFT5Attention(tf.keras.layers.Layer):
         self.inner_dim = self.n_heads * self.d_kv
 
         # Mesh TensorFlow initialization to avoid scaling before softmax
-        self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='q')
-        self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='k')
-        self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='v')
-        self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name='o')
+        self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="q")
+        self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="k")
+        self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="v")
+        self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name="o")
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
 
         if self.has_relative_attention_bias:
-            self.relative_attention_bias = tf.keras.layers.Embedding(self.relative_attention_num_buckets,
-                                                                     self.n_heads,
-                                                                     name='relative_attention_bias')
+            self.relative_attention_bias = tf.keras.layers.Embedding(
+                self.relative_attention_num_buckets, self.n_heads, name="relative_attention_bias"
+            )
         self.pruned_heads = set()
 
     def prune_heads(self, heads):
         raise NotImplementedError
 
     @staticmethod
-    def _relative_position_bucket(relative_position,
-                                  bidirectional=True,
-                                  num_buckets=32,
-                                  max_distance=128):
+    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
         """
         Adapted from Mesh Tensorflow:
         https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
@@ -170,7 +164,10 @@ class TFT5Attention(tf.keras.layers.Layer):
         is_small = tf.math.less(n, max_exact)
         val_if_large = max_exact + tf.dtypes.cast(
             tf.math.log(tf.dtypes.cast(n, tf.float32) / max_exact)
-            / math.log(max_distance / max_exact) * (num_buckets - max_exact), tf.int32)
+            / math.log(max_distance / max_exact)
+            * (num_buckets - max_exact),
+            tf.int32,
+        )
         val_if_large = tf.math.minimum(val_if_large, num_buckets - 1)
         ret += tf.where(is_small, n, val_if_large)
         return ret
@@ -180,11 +177,11 @@ class TFT5Attention(tf.keras.layers.Layer):
         context_position = tf.range(qlen)[:, None]
         memory_position = tf.range(klen)[None, :]
         relative_position = memory_position - context_position  # shape (qlen, klen)
-        rp_bucket = self._relative_position_bucket(relative_position,
-                                                   bidirectional=not self.is_decoder,
-                                                   num_buckets=self.relative_attention_num_buckets)
+        rp_bucket = self._relative_position_bucket(
+            relative_position, bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets
+        )
         values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
-        values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen)
+        values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0)  # shape (1, num_heads, qlen, klen)
         return values
 
     def call(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None, training=False):
@@ -195,7 +192,7 @@ class TFT5Attention(tf.keras.layers.Layer):
         # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
         bs, qlen, dim = shape_list(input)
         if kv is None:
-            klen = qlen if cache is None else cache['slen'] + qlen
+            klen = qlen if cache is None else cache["slen"] + qlen
         else:
             klen = shape_list(kv)[1]
 
@@ -207,28 +204,28 @@ class TFT5Attention(tf.keras.layers.Layer):
             """  compute context """
             return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.inner_dim))
 
-        q = shape(self.q(input))                                          # (bs, n_heads, qlen, dim_per_head)
+        q = shape(self.q(input))  # (bs, n_heads, qlen, dim_per_head)
         if kv is None:
-            k = shape(self.k(input))                                      # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v(input))                                      # (bs, n_heads, qlen, dim_per_head)
+            k = shape(self.k(input))  # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v(input))  # (bs, n_heads, qlen, dim_per_head)
         elif cache is None or self.layer_id not in cache:
             k = v = kv
-            k = shape(self.k(k))                                          # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v(v))                                          # (bs, n_heads, qlen, dim_per_head)
+            k = shape(self.k(k))  # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v(v))  # (bs, n_heads, qlen, dim_per_head)
 
         if cache is not None:
             if self.layer_id in cache:
                 if kv is None:
                     k_, v_ = cache[self.layer_id]
-                    k = tf.concat([k_, k], axis=2)                             # (bs, n_heads, klen, dim_per_head)
-                    v = tf.concat([v_, v], axis=2)                             # (bs, n_heads, klen, dim_per_head)
+                    k = tf.concat([k_, k], axis=2)  # (bs, n_heads, klen, dim_per_head)
+                    v = tf.concat([v_, v], axis=2)  # (bs, n_heads, klen, dim_per_head)
                 else:
                     k, v = cache[self.layer_id]
             cache[self.layer_id] = (k, v)
 
         # q = q / math.sqrt(dim_per_head)                                     # No scaling in T5
         # scores = tf.matmul(q, k, transpose_b=True)                            # (bs, n_heads, qlen, klen)
-        scores = tf.einsum('bnqd,bnkd->bnqk', q, k)                        # (bs, n_heads, qlen, klen)
+        scores = tf.einsum("bnqd,bnkd->bnqk", q, k)  # (bs, n_heads, qlen, klen)
 
         if position_bias is None:
             if not self.has_relative_attention_bias:
@@ -240,15 +237,15 @@ class TFT5Attention(tf.keras.layers.Layer):
                 # scores.masked_fill_(mask, -float('inf'))                          # (bs, n_heads, qlen, klen)
 
         scores += position_bias
-        weights = tf.nn.softmax(scores, axis=-1)                              # (bs, n_heads, qlen, klen)
-        weights = self.dropout(weights, training=training)                    # (bs, n_heads, qlen, klen)
+        weights = tf.nn.softmax(scores, axis=-1)  # (bs, n_heads, qlen, klen)
+        weights = self.dropout(weights, training=training)  # (bs, n_heads, qlen, klen)
 
         # Mask heads if we want to
         if head_mask is not None:
             weights = weights * head_mask
 
-        context = tf.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
-        context = unshape(context)                                            # (bs, qlen, dim)
+        context = tf.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)  # (bs, qlen, dim)
 
         context = self.o(context)
 
@@ -263,21 +260,17 @@ class TFT5Attention(tf.keras.layers.Layer):
 class TFT5LayerSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, has_relative_attention_bias=False, **kwargs):
         super(TFT5LayerSelfAttention, self).__init__(**kwargs)
-        self.SelfAttention = TFT5Attention(config,
-                                           has_relative_attention_bias=has_relative_attention_bias,
-                                           name='SelfAttention')
-        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
-                                        name='layer_norm')
+        self.SelfAttention = TFT5Attention(
+            config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention"
+        )
+        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
 
-    def call(self, hidden_states, attention_mask=None, position_bias=None,
-             head_mask=None, training=False):
+    def call(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None, training=False):
         norm_x = self.layer_norm(hidden_states)
-        attention_output = self.SelfAttention(norm_x,
-                                              mask=attention_mask,
-                                              position_bias=position_bias,
-                                              head_mask=head_mask,
-                                              training=training)
+        attention_output = self.SelfAttention(
+            norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask, training=training
+        )
         y = attention_output[0]
         layer_output = hidden_states + self.dropout(y, training=training)
         outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
@@ -287,22 +280,17 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer):
 class TFT5LayerCrossAttention(tf.keras.layers.Layer):
     def __init__(self, config, has_relative_attention_bias=False, **kwargs):
         super(TFT5LayerCrossAttention, self).__init__(**kwargs)
-        self.EncDecAttention = TFT5Attention(config,
-                                           has_relative_attention_bias=has_relative_attention_bias,
-                                           name='EncDecAttention')
-        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
-                                        name='layer_norm')
+        self.EncDecAttention = TFT5Attention(
+            config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention"
+        )
+        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
 
-    def call(self, hidden_states, kv, attention_mask=None, position_bias=None,
-             head_mask=None, training=False):
+    def call(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None, training=False):
         norm_x = self.layer_norm(hidden_states)
-        attention_output = self.EncDecAttention(norm_x,
-                                                mask=attention_mask,
-                                                kv=kv,
-                                                position_bias=position_bias,
-                                                head_mask=head_mask,
-                                                training=training)
+        attention_output = self.EncDecAttention(
+            norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask, training=training
+        )
         y = attention_output[0]
         layer_output = hidden_states + self.dropout(y, training=training)
         outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
@@ -314,43 +302,57 @@ class TFT5Block(tf.keras.layers.Layer):
         super(TFT5Block, self).__init__(**kwargs)
         self.is_decoder = config.is_decoder
         self.layer = []
-        self.layer.append(TFT5LayerSelfAttention(config,
-                                                 has_relative_attention_bias=has_relative_attention_bias,
-                                                 name='layer_._0'))
+        self.layer.append(
+            TFT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._0")
+        )
         if self.is_decoder:
-            self.layer.append(TFT5LayerCrossAttention(config,
-                                                      has_relative_attention_bias=has_relative_attention_bias,
-                                                      name='layer_._1'))
-            self.layer.append(TFT5LayerFF(config, name='layer_._2'))
+            self.layer.append(
+                TFT5LayerCrossAttention(
+                    config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._1"
+                )
+            )
+            self.layer.append(TFT5LayerFF(config, name="layer_._2"))
         else:
-            self.layer.append(TFT5LayerFF(config, name='layer_._1'))
+            self.layer.append(TFT5LayerFF(config, name="layer_._1"))
 
-    def call(self, hidden_states, attention_mask=None, position_bias=None,
-             encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
-             head_mask=None, training=False):
-        self_attention_outputs = self.layer[0](hidden_states,
-                                                attention_mask=attention_mask,
-                                                position_bias=position_bias,
-                                                head_mask=head_mask,
-                                                training=training)
+    def call(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_bias=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        encoder_decoder_position_bias=None,
+        head_mask=None,
+        training=False,
+    ):
+        self_attention_outputs = self.layer[0](
+            hidden_states,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            head_mask=head_mask,
+            training=training,
+        )
         hidden_states = self_attention_outputs[0]
         outputs = self_attention_outputs[1:]
 
         if not self.is_decoder:
             hidden_states = self.layer[1](hidden_states, training=training)
         else:
-            cross_attention_outputs = self.layer[1](hidden_states,
-                                                    kv=encoder_hidden_states,
-                                                    attention_mask=encoder_attention_mask,
-                                                    position_bias=encoder_decoder_position_bias,
-                                                    head_mask=head_mask,
-                                                    training=training)
+            cross_attention_outputs = self.layer[1](
+                hidden_states,
+                kv=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                position_bias=encoder_decoder_position_bias,
+                head_mask=head_mask,
+                training=training,
+            )
             hidden_states = cross_attention_outputs[0]
             outputs = outputs + cross_attention_outputs[1:]
             hidden_states = self.layer[2](hidden_states, training=training)
 
         outputs = (hidden_states,) + outputs  # add attentions if we output them
-        return outputs # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+        return outputs  # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
 
 
 ####################################################
@@ -366,12 +368,11 @@ class TFT5MainLayer(tf.keras.layers.Layer):
         self.config = config
         self.num_hidden_layers = config.num_layers
 
-        self.block = [TFT5Block(config,
-                                has_relative_attention_bias=bool(i == 0),
-                                name='block_._{}'.format(i))
-                        for i in range(config.num_layers)]
-        self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
-                                              name='final_layer_norm')
+        self.block = [
+            TFT5Block(config, has_relative_attention_bias=bool(i == 0), name="block_._{}".format(i))
+            for i in range(config.num_layers)
+        ]
+        self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm")
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
 
     def _resize_token_embeddings(self, new_num_tokens):
@@ -380,8 +381,15 @@ class TFT5MainLayer(tf.keras.layers.Layer):
     def _prune_heads(self, heads_to_prune):
         raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
 
-    def call(self, hidden_states, attention_mask=None, encoder_hidden_states=None,
-             encoder_attention_mask=None, head_mask=None, training=False):
+    def call(
+        self,
+        hidden_states,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        head_mask=None,
+        training=False,
+    ):
 
         batch_size, seq_length = shape_list(hidden_states)[:2]
         if attention_mask is None:
@@ -397,13 +405,14 @@ class TFT5MainLayer(tf.keras.layers.Layer):
         if num_dims_attention_mask == 3:
             extended_attention_mask = attention_mask[:, None, :, :]
         elif num_dims_attention_mask == 2:
-        # Provided a padding mask of dimensions [batch_size, seq_length]
-        # - if the model is a decoder, apply a causal mask in addition to the padding mask
-        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            # Provided a padding mask of dimensions [batch_size, seq_length]
+            # - if the model is a decoder, apply a causal mask in addition to the padding mask
+            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
             if self.config.is_decoder:
                 seq_ids = tf.range(seq_length)
-                causal_mask = tf.less_equal(tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)),
-                                            seq_ids[None, :, None])
+                causal_mask = tf.less_equal(
+                    tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)), seq_ids[None, :, None]
+                )
                 causal_mask = tf.cast(causal_mask, dtype=tf.float32)
                 extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
             else:
@@ -460,14 +469,16 @@ class TFT5MainLayer(tf.keras.layers.Layer):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
-            layer_outputs = layer_module(hidden_states,
-                                         attention_mask=extended_attention_mask,
-                                         position_bias=position_bias,
-                                         encoder_hidden_states=encoder_hidden_states,
-                                         encoder_attention_mask=encoder_extended_attention_mask,
-                                         encoder_decoder_position_bias=encoder_decoder_position_bias,
-                                         head_mask=head_mask[i],
-                                         training=training)
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask=extended_attention_mask,
+                position_bias=position_bias,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_extended_attention_mask,
+                encoder_decoder_position_bias=encoder_decoder_position_bias,
+                head_mask=head_mask[i],
+                training=training,
+            )
             hidden_states = layer_outputs[0]
             if i == 0:
                 # We share the position biases between the layers - the first layer store them
@@ -505,6 +516,7 @@ class TFT5PreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = T5Config
     pretrained_model_archive_map = TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "transformer"
@@ -513,9 +525,11 @@ class TFT5PreTrainedModel(TFPreTrainedModel):
     def dummy_inputs(self):
         input_ids = tf.constant(DUMMY_INPUTS)
         input_mask = tf.constant(DUMMY_MASK)
-        dummy_inputs = {'decoder_input_ids': input_ids,
-                        'encoder_input_ids': input_ids,
-                        'decoder_attention_mask': input_mask}
+        dummy_inputs = {
+            "decoder_input_ids": input_ids,
+            "encoder_input_ids": input_ids,
+            "decoder_attention_mask": input_mask,
+        }
         return dummy_inputs
 
 
@@ -586,9 +600,12 @@ T5_INPUTS_DOCSTRING = r"""
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
 
-@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states"
-                      "without any specific head on top.",
-                      T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.",
+    T5_START_DOCSTRING,
+    T5_INPUTS_DOCSTRING,
+)
 class TFT5Model(TFT5PreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -614,17 +631,17 @@ class TFT5Model(TFT5PreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFT5Model, self).__init__(config, *inputs, **kwargs)
-        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model,
-                                         name='shared')
+        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
 
         encoder_config = copy.deepcopy(config)
-        self.encoder = TFT5MainLayer(encoder_config, name='encoder')
+        self.encoder = TFT5MainLayer(encoder_config, name="encoder")
 
         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        self.decoder = TFT5MainLayer(decoder_config, name='decoder')
+        self.decoder = TFT5MainLayer(decoder_config, name="decoder")
 
     def get_input_embeddings(self):
         return self.shared
@@ -641,14 +658,15 @@ class TFT5Model(TFT5PreTrainedModel):
         if isinstance(decoder_input_ids, dict):
             kwargs.update(decoder_input_ids)
         else:
-            kwargs['decoder_input_ids'] = decoder_input_ids
+            kwargs["decoder_input_ids"] = decoder_input_ids
 
-        kwargs_common = dict((k, v) for k, v in kwargs.items()
-                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_common = dict(
+            (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_")
+        )
         kwargs_encoder = kwargs_common.copy()
         kwargs_decoder = kwargs_common.copy()
-        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
-        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+        kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_")))
 
         # Encode if needed (training, first prediction pass)
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
@@ -678,8 +696,7 @@ class TFT5Model(TFT5PreTrainedModel):
         return decoder_outputs + encoder_outputs
 
 
-@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
-    T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
 class TFT5WithLMHeadModel(TFT5PreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -705,19 +722,19 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel):
         prediction_scores = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs)
         self.model_dim = config.d_model
 
-        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model,
-                                         name='shared')
+        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
 
         encoder_config = copy.deepcopy(config)
-        self.encoder = TFT5MainLayer(encoder_config, name='encoder')
+        self.encoder = TFT5MainLayer(encoder_config, name="encoder")
 
         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        self.decoder = TFT5MainLayer(decoder_config, name='decoder')
+        self.decoder = TFT5MainLayer(decoder_config, name="decoder")
 
     def get_input_embeddings(self):
         return self.shared
@@ -734,14 +751,15 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel):
         if isinstance(decoder_input_ids, dict):
             kwargs.update(decoder_input_ids)
         else:
-            kwargs['decoder_input_ids'] = decoder_input_ids
+            kwargs["decoder_input_ids"] = decoder_input_ids
 
-        kwargs_common = dict((k, v) for k, v in kwargs.items()
-                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_common = dict(
+            (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_")
+        )
         kwargs_encoder = kwargs_common.copy()
         kwargs_decoder = kwargs_common.copy()
-        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
-        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+        kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_")))
 
         # Encode if needed (training, first prediction pass)
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py
index 08bbe74032..fc7ea932aa 100644
--- a/transformers/modeling_tf_transfo_xl.py
+++ b/transformers/modeling_tf_transfo_xl.py
@@ -37,9 +37,10 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 
 TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-tf_model.h5",
+    "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-tf_model.h5",
 }
 
+
 class TFPositionalEmbedding(tf.keras.layers.Layer):
     def __init__(self, demb, **kwargs):
         super(TFPositionalEmbedding, self).__init__(**kwargs)
@@ -47,7 +48,7 @@ class TFPositionalEmbedding(tf.keras.layers.Layer):
         self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))
 
     def call(self, pos_seq, bsz=None):
-        sinusoid_inp = tf.einsum('i,j->ij', pos_seq, self.inv_freq)
+        sinusoid_inp = tf.einsum("i,j->ij", pos_seq, self.inv_freq)
         pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
 
         if bsz is not None:
@@ -64,17 +65,14 @@ class TFPositionwiseFF(tf.keras.layers.Layer):
         self.d_inner = d_inner
         self.dropout = dropout
 
-        self.layer_1 = tf.keras.layers.Dense(d_inner,
-                                             kernel_initializer=get_initializer(init_std),
-                                             activation=tf.nn.relu,
-                                             name='CoreNet_._0')
+        self.layer_1 = tf.keras.layers.Dense(
+            d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0"
+        )
         self.drop_1 = tf.keras.layers.Dropout(dropout)
-        self.layer_2 = tf.keras.layers.Dense(d_model,
-                                             kernel_initializer=get_initializer(init_std),
-                                             name='CoreNet_._3')
+        self.layer_2 = tf.keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3")
         self.drop_2 = tf.keras.layers.Dropout(dropout)
 
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm')
+        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
 
         self.pre_lnorm = pre_lnorm
 
@@ -103,10 +101,24 @@ class TFPositionwiseFF(tf.keras.layers.Layer):
 
 
 class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
-    def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
-                 tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False,
-                 r_r_bias=None, r_w_bias=None, output_attentions=False, 
-                 layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
+    def __init__(
+        self,
+        n_head,
+        d_model,
+        d_head,
+        dropout,
+        dropatt=0,
+        tgt_len=None,
+        ext_len=None,
+        mem_len=None,
+        pre_lnorm=False,
+        r_r_bias=None,
+        r_w_bias=None,
+        output_attentions=False,
+        layer_norm_epsilon=1e-5,
+        init_std=0.02,
+        **kwargs
+    ):
         super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs)
 
         self.output_attentions = output_attentions
@@ -115,46 +127,41 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
         self.d_head = d_head
         self.dropout = dropout
 
-        self.qkv_net = tf.keras.layers.Dense(3 * n_head * d_head,
-                                             kernel_initializer=get_initializer(init_std),
-                                             use_bias=False,
-                                             name='qkv_net')
+        self.qkv_net = tf.keras.layers.Dense(
+            3 * n_head * d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="qkv_net"
+        )
 
         self.drop = tf.keras.layers.Dropout(dropout)
         self.dropatt = tf.keras.layers.Dropout(dropatt)
-        self.o_net = tf.keras.layers.Dense(d_model,
-                                           kernel_initializer=get_initializer(init_std),
-                                           use_bias=False,
-                                           name='o_net')
+        self.o_net = tf.keras.layers.Dense(
+            d_model, kernel_initializer=get_initializer(init_std), use_bias=False, name="o_net"
+        )
 
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm')
+        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
 
         self.scale = 1 / (d_head ** 0.5)
 
         self.pre_lnorm = pre_lnorm
 
-        if r_r_bias is not None and r_w_bias is not None: # Biases are shared
+        if r_r_bias is not None and r_w_bias is not None:  # Biases are shared
             self.r_r_bias = r_r_bias
             self.r_w_bias = r_w_bias
         else:
             self.r_r_bias = None
             self.r_w_bias = None
 
-        self.r_net = tf.keras.layers.Dense(self.n_head * self.d_head,
-                                           kernel_initializer=get_initializer(init_std),
-                                           use_bias=False,
-                                           name='r_net')
+        self.r_net = tf.keras.layers.Dense(
+            self.n_head * self.d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="r_net"
+        )
 
     def build(self, input_shape):
-        if self.r_r_bias is None or self.r_w_bias is None: # Biases are not shared
-            self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head),
-                                            initializer='zeros',
-                                            trainable=True,
-                                            name='r_r_bias')
-            self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head),
-                                            initializer='zeros',
-                                            trainable=True,
-                                            name='r_w_bias')
+        if self.r_r_bias is None or self.r_w_bias is None:  # Biases are not shared
+            self.r_r_bias = self.add_weight(
+                shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
+            )
+            self.r_w_bias = self.add_weight(
+                shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
+            )
         super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape)
 
     def _rel_shift(self, x):
@@ -196,14 +203,14 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
         w_head_k = tf.reshape(w_head_k, (klen, bsz, self.n_head, self.d_head))  # qlen x bsz x n_head x d_head
         w_head_v = tf.reshape(w_head_v, (klen, bsz, self.n_head, self.d_head))  # qlen x bsz x n_head x d_head
 
-        r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head))       # qlen x n_head x d_head
+        r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head))  # qlen x n_head x d_head
 
         #### compute attention score
-        rw_head_q = w_head_q + self.r_w_bias                                    # qlen x bsz x n_head x d_head
-        AC = tf.einsum('ibnd,jbnd->ijbn', rw_head_q, w_head_k)                  # qlen x klen x bsz x n_head
+        rw_head_q = w_head_q + self.r_w_bias  # qlen x bsz x n_head x d_head
+        AC = tf.einsum("ibnd,jbnd->ijbn", rw_head_q, w_head_k)  # qlen x klen x bsz x n_head
 
         rr_head_q = w_head_q + self.r_r_bias
-        BD = tf.einsum('ibnd,jnd->ijbn', rr_head_q, r_head_k)                   # qlen x klen x bsz x n_head
+        BD = tf.einsum("ibnd,jnd->ijbn", rr_head_q, r_head_k)  # qlen x klen x bsz x n_head
         BD = self._rel_shift(BD)
 
         # [qlen x klen x bsz x n_head]
@@ -224,12 +231,11 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
             attn_prob = attn_prob * head_mask
 
         #### compute attention vector
-        attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, w_head_v)
+        attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, w_head_v)
 
         # [qlen x bsz x n_head x d_head]
         attn_vec_sizes = shape_list(attn_vec)
-        attn_vec = tf.reshape(attn_vec, 
-                        (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head))
+        attn_vec = tf.reshape(attn_vec, (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head))
 
         ##### linear projection
         attn_out = self.o_net(attn_vec)
@@ -249,32 +255,57 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
 
 
 class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
-    def __init__(self, n_head, d_model, d_head, d_inner, dropout,
-                 tgt_len=None, ext_len=None, mem_len=None,
-                 dropatt=0., pre_lnorm=False,
-                 r_w_bias=None,
-                 r_r_bias=None,
-                 output_attentions=False,
-                 layer_norm_epsilon=1e-5,
-                 init_std=0.02,
-                 **kwargs):
+    def __init__(
+        self,
+        n_head,
+        d_model,
+        d_head,
+        d_inner,
+        dropout,
+        tgt_len=None,
+        ext_len=None,
+        mem_len=None,
+        dropatt=0.0,
+        pre_lnorm=False,
+        r_w_bias=None,
+        r_r_bias=None,
+        output_attentions=False,
+        layer_norm_epsilon=1e-5,
+        init_std=0.02,
+        **kwargs
+    ):
         super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs)
 
-        self.dec_attn = TFRelPartialLearnableMultiHeadAttn(n_head, d_model,
-                            d_head, dropout, tgt_len=tgt_len, ext_len=ext_len,
-                            mem_len=mem_len, dropatt=dropatt, pre_lnorm=pre_lnorm,
-                            r_w_bias=r_w_bias, r_r_bias=r_r_bias, init_std=init_std,
-                            output_attentions=output_attentions,
-                            layer_norm_epsilon=layer_norm_epsilon, name='dec_attn')
-        self.pos_ff = TFPositionwiseFF(d_model, d_inner, dropout, 
-                                       pre_lnorm=pre_lnorm, init_std=init_std,
-                                       layer_norm_epsilon=layer_norm_epsilon,
-                                       name='pos_ff')
+        self.dec_attn = TFRelPartialLearnableMultiHeadAttn(
+            n_head,
+            d_model,
+            d_head,
+            dropout,
+            tgt_len=tgt_len,
+            ext_len=ext_len,
+            mem_len=mem_len,
+            dropatt=dropatt,
+            pre_lnorm=pre_lnorm,
+            r_w_bias=r_w_bias,
+            r_r_bias=r_r_bias,
+            init_std=init_std,
+            output_attentions=output_attentions,
+            layer_norm_epsilon=layer_norm_epsilon,
+            name="dec_attn",
+        )
+        self.pos_ff = TFPositionwiseFF(
+            d_model,
+            d_inner,
+            dropout,
+            pre_lnorm=pre_lnorm,
+            init_std=init_std,
+            layer_norm_epsilon=layer_norm_epsilon,
+            name="pos_ff",
+        )
 
     def call(self, inputs, training=False):
         dec_inp, r, dec_attn_mask, mems, head_mask = inputs
-        attn_outputs = self.dec_attn([dec_inp, r, dec_attn_mask,
-                                      mems, head_mask], training=training)
+        attn_outputs = self.dec_attn([dec_inp, r, dec_attn_mask, mems, head_mask], training=training)
         ff_output = self.pos_ff(attn_outputs[0], training=training)
 
         outputs = [ff_output] + attn_outputs[1:]
@@ -283,8 +314,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
 
 
 class TFAdaptiveEmbedding(tf.keras.layers.Layer):
-    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02,
-                 sample_softmax=False, **kwargs):
+    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs):
         super(TFAdaptiveEmbedding, self).__init__(**kwargs)
 
         self.n_token = n_token
@@ -305,20 +335,28 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
             raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
         else:
             for i in range(len(self.cutoffs)):
-                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
+                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
                 d_emb_i = d_embed // (div_val ** i)
-                self.emb_layers.append(tf.keras.layers.Embedding(r_idx-l_idx,
-                                                                 d_emb_i,
-                                                                 embeddings_initializer=get_initializer(init_std),
-                                                                 name='emb_layers_._{}'.format(i)))
+                self.emb_layers.append(
+                    tf.keras.layers.Embedding(
+                        r_idx - l_idx,
+                        d_emb_i,
+                        embeddings_initializer=get_initializer(init_std),
+                        name="emb_layers_._{}".format(i),
+                    )
+                )
 
     def build(self, input_shape):
         for i in range(len(self.cutoffs)):
             d_emb_i = self.d_embed // (self.div_val ** i)
-            self.emb_projs.append(self.add_weight(shape=(d_emb_i, self.d_proj),
-                                                  initializer=get_initializer(self.init_std),
-                                                  trainable=True,
-                                                  name='emb_projs_._{}'.format(i)))
+            self.emb_projs.append(
+                self.add_weight(
+                    shape=(d_emb_i, self.d_proj),
+                    initializer=get_initializer(self.init_std),
+                    trainable=True,
+                    name="emb_projs_._{}".format(i),
+                )
+            )
         super(TFAdaptiveEmbedding, self).build(input_shape)
 
     def call(self, inp):
@@ -334,7 +372,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
 
                 inp_i = tf.boolean_mask(inp_flat, mask_i) - l_idx
                 emb_i = self.emb_layers[i](inp_i)
-                emb_i = tf.einsum('id,de->ie', emb_i, self.emb_projs[i])
+                emb_i = tf.einsum("id,de->ie", emb_i, self.emb_projs[i])
 
                 mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64)
                 emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(shape_list(emb_flat), dtype=tf.int64))
@@ -361,8 +399,15 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
         self.d_head = config.d_head
         self.untie_r = config.untie_r
 
-        self.word_emb = TFAdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs, 
-                                            div_val=config.div_val, init_std=config.init_std, name='word_emb')
+        self.word_emb = TFAdaptiveEmbedding(
+            config.vocab_size,
+            config.d_embed,
+            config.d_model,
+            config.cutoffs,
+            div_val=config.div_val,
+            init_std=config.init_std,
+            name="word_emb",
+        )
 
         self.drop = tf.keras.layers.Dropout(config.dropout)
 
@@ -376,41 +421,47 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
         self.attn_type = config.attn_type
 
         self.layers = []
-        if config.attn_type == 0: # the default attention
+        if config.attn_type == 0:  # the default attention
             for i in range(config.n_layer):
                 self.layers.append(
                     TFRelPartialLearnableDecoderLayer(
-                        config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,
-                        tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len,
-                        dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
+                        config.n_head,
+                        config.d_model,
+                        config.d_head,
+                        config.d_inner,
+                        config.dropout,
+                        tgt_len=config.tgt_len,
+                        ext_len=config.ext_len,
+                        mem_len=config.mem_len,
+                        dropatt=config.dropatt,
+                        pre_lnorm=config.pre_lnorm,
                         r_w_bias=None if self.untie_r else self.r_w_bias,
                         r_r_bias=None if self.untie_r else self.r_r_bias,
                         output_attentions=self.output_attentions,
                         layer_norm_epsilon=config.layer_norm_epsilon,
                         init_std=config.init_std,
-                        name='layers_._{}'.format(i))
+                        name="layers_._{}".format(i),
+                    )
                 )
-        else: # learnable embeddings and absolute embeddings
+        else:  # learnable embeddings and absolute embeddings
             raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
 
         self.same_length = config.same_length
         self.clamp_len = config.clamp_len
 
-        if self.attn_type == 0: # default attention
-            self.pos_emb = TFPositionalEmbedding(self.d_model, name='pos_emb')
-        else: # learnable embeddings and absolute embeddings
+        if self.attn_type == 0:  # default attention
+            self.pos_emb = TFPositionalEmbedding(self.d_model, name="pos_emb")
+        else:  # learnable embeddings and absolute embeddings
             raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
 
     def build(self, input_shape):
         if not self.untie_r:
-            self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head),
-                                            initializer='zeros',
-                                            trainable=True,
-                                            name='r_w_bias')
-            self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head),
-                                            initializer='zeros',
-                                            trainable=True,
-                                            name='r_r_bias')
+            self.r_w_bias = self.add_weight(
+                shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
+            )
+            self.r_r_bias = self.add_weight(
+                shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
+            )
         super(TFTransfoXLMainLayer, self).build(input_shape)
 
     def get_input_embeddings(self):
@@ -443,10 +494,11 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
 
     def _update_mems(self, hids, mems, qlen, mlen):
         # does not deal with None
-        if mems is None: return None
+        if mems is None:
+            return None
 
         # mems is not None
-        assert len(hids) == len(mems), 'len(hids) != len(mems)'
+        assert len(hids) == len(mems), "len(hids) != len(mems)"
 
         # There are `mlen + qlen` steps that can be cached into mems
         # For the next step, the last `ext_len` of the `qlen` tokens
@@ -472,10 +524,10 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
             inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
             assert len(inputs) <= 4, "Too many inputs."
         elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            mems = inputs.get('mems', mems)
-            head_mask = inputs.get('head_mask', head_mask)
-            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            input_ids = inputs.get("input_ids")
+            mems = inputs.get("mems", mems)
+            head_mask = inputs.get("head_mask", head_mask)
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             assert len(inputs) <= 4, "Too many inputs."
         else:
             input_ids = inputs
@@ -521,8 +573,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
         dec_attn_mask = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
         if self.same_length:
             mask_l = tf.linalg.band_part(attn_mask, -1, 0)
-            dec_attn_mask = tf.concat([dec_attn_mask[:, :qlen] + mask_l - mask_dia,
-                                       dec_attn_mask[:, qlen:]], 1)
+            dec_attn_mask = tf.concat([dec_attn_mask[:, :qlen] + mask_l - mask_dia, dec_attn_mask[:, qlen:]], 1)
         # ::: PyTorch masking code for reference :::
         # if self.same_length:
         #     all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8)
@@ -539,8 +590,8 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
 
         hids = []
         attentions = []
-        if self.attn_type == 0: # default
-            pos_seq = tf.range(klen-1, -1, -1.0)
+        if self.attn_type == 0:  # default
+            pos_seq = tf.range(klen - 1, -1, -1.0)
             if self.clamp_len > 0:
                 pos_seq = tf.minimum(pos_seq, self.clamp_len)
             pos_emb = self.pos_emb(pos_seq)
@@ -551,12 +602,11 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
             for i, layer in enumerate(self.layers):
                 hids.append(core_out)
                 mems_i = None if mems is None else mems[i]
-                layer_outputs = layer([core_out, pos_emb, dec_attn_mask,
-                                       mems_i, head_mask[i]], training=training)
+                layer_outputs = layer([core_out, pos_emb, dec_attn_mask, mems_i, head_mask[i]], training=training)
                 core_out = layer_outputs[0]
                 if self.output_attentions:
                     attentions.append(layer_outputs[1])
-        else: # learnable embeddings and absolute embeddings
+        else:  # learnable embeddings and absolute embeddings
             raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
 
         core_out = self.drop(core_out, training=training)
@@ -581,6 +631,7 @@ class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = TransfoXLConfig
     pretrained_model_archive_map = TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "transformer"
@@ -647,8 +698,12 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
-                      TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
+    TRANSFO_XL_START_DOCSTRING,
+    TRANSFO_XL_INPUTS_DOCSTRING,
+)
 class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -678,18 +733,22 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
         last_hidden_states, mems = outputs[:2]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFTransfoXLModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFTransfoXLMainLayer(config, name='transformer')
+        self.transformer = TFTransfoXLMainLayer(config, name="transformer")
 
     def call(self, inputs, **kwargs):
         outputs = self.transformer(inputs, **kwargs)
         return outputs
 
 
-@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top
+@add_start_docstrings(
+    """The Transformer-XL Model with a language modeling head on top
     (adaptive softmax with weights tied to the adaptive input embeddings)""",
-    TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
+    TRANSFO_XL_START_DOCSTRING,
+    TRANSFO_XL_INPUTS_DOCSTRING,
+)
 class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -720,17 +779,19 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
         prediction_scores, mems = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(TFTransfoXLLMHeadModel, self).__init__(config)
-        self.transformer = TFTransfoXLMainLayer(config, name='transformer')
+        self.transformer = TFTransfoXLMainLayer(config, name="transformer")
         self.sample_softmax = config.sample_softmax
         # use sampled softmax
         if config.sample_softmax > 0:
             raise NotImplementedError
         # use adaptive softmax (including standard softmax)
         else:
-            self.crit = TFAdaptiveSoftmaxMask(config.vocab_size, config.d_embed, config.d_model, 
-                                              config.cutoffs, div_val=config.div_val, name='crit')
+            self.crit = TFAdaptiveSoftmaxMask(
+                config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, name="crit"
+            )
 
     def reset_length(self, tgt_len, ext_len, mem_len):
         self.transformer.reset_length(tgt_len, ext_len, mem_len)
@@ -747,11 +808,11 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
             labels = inputs[4] if len(inputs) > 4 else labels
             assert len(inputs) <= 5, "Too many inputs."
         elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            mems = inputs.get('mems', mems)
-            head_mask = inputs.get('head_mask', head_mask)
-            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
-            labels = inputs.get('labels', labels)
+            input_ids = inputs.get("input_ids")
+            mems = inputs.get("mems", mems)
+            head_mask = inputs.get("head_mask", head_mask)
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
+            labels = inputs.get("labels", labels)
             assert len(inputs) <= 5, "Too many inputs."
         else:
             input_ids = inputs
diff --git a/transformers/modeling_tf_transfo_xl_utilities.py b/transformers/modeling_tf_transfo_xl_utilities.py
index f730af851f..0f2a4ebeba 100644
--- a/transformers/modeling_tf_transfo_xl_utilities.py
+++ b/transformers/modeling_tf_transfo_xl_utilities.py
@@ -24,9 +24,9 @@ import tensorflow as tf
 
 from .modeling_tf_utils import shape_list
 
+
 class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
-    def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1,
-                 keep_order=False, **kwargs):
+    def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs):
         super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
 
         self.vocab_size = vocab_size
@@ -47,52 +47,59 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
 
     def build(self, input_shape):
         if self.n_clusters > 0:
-            self.cluster_weight = self.add_weight(shape=(self.n_clusters, self.d_embed),
-                                                  initializer='zeros',
-                                                  trainable=True,
-                                                  name='cluster_weight')
-            self.cluster_bias = self.add_weight(shape=(self.n_clusters,),
-                                                initializer='zeros',
-                                                trainable=True,
-                                                name='cluster_bias')
+            self.cluster_weight = self.add_weight(
+                shape=(self.n_clusters, self.d_embed), initializer="zeros", trainable=True, name="cluster_weight"
+            )
+            self.cluster_bias = self.add_weight(
+                shape=(self.n_clusters,), initializer="zeros", trainable=True, name="cluster_bias"
+            )
 
         if self.div_val == 1:
             for i in range(len(self.cutoffs)):
                 if self.d_proj != self.d_embed:
-                    weight = self.add_weight(shape=(self.d_embed, self.d_proj),
-                                             initializer='zeros',
-                                             trainable=True,
-                                             name='out_projs_._{}'.format(i))
+                    weight = self.add_weight(
+                        shape=(self.d_embed, self.d_proj),
+                        initializer="zeros",
+                        trainable=True,
+                        name="out_projs_._{}".format(i),
+                    )
                     self.out_projs.append(weight)
                 else:
                     self.out_projs.append(None)
-                weight = self.add_weight(shape=(self.vocab_size, self.d_embed,),
-                                         initializer='zeros',
-                                         trainable=True,
-                                         name='out_layers_._{}_._weight'.format(i))
-                bias = self.add_weight(shape=(self.vocab_size,),
-                                         initializer='zeros',
-                                         trainable=True,
-                                         name='out_layers_._{}_._bias'.format(i))
+                weight = self.add_weight(
+                    shape=(self.vocab_size, self.d_embed,),
+                    initializer="zeros",
+                    trainable=True,
+                    name="out_layers_._{}_._weight".format(i),
+                )
+                bias = self.add_weight(
+                    shape=(self.vocab_size,),
+                    initializer="zeros",
+                    trainable=True,
+                    name="out_layers_._{}_._bias".format(i),
+                )
                 self.out_layers.append((weight, bias))
         else:
             for i in range(len(self.cutoffs)):
-                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
+                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
                 d_emb_i = self.d_embed // (self.div_val ** i)
 
-                weight = self.add_weight(shape=(d_emb_i, self.d_proj),
-                                         initializer='zeros',
-                                         trainable=True,
-                                         name='out_projs_._{}'.format(i))
+                weight = self.add_weight(
+                    shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name="out_projs_._{}".format(i)
+                )
                 self.out_projs.append(weight)
-                weight = self.add_weight(shape=(r_idx-l_idx, d_emb_i,),
-                                         initializer='zeros',
-                                         trainable=True,
-                                         name='out_layers_._{}_._weight'.format(i))
-                bias = self.add_weight(shape=(r_idx-l_idx,),
-                                         initializer='zeros',
-                                         trainable=True,
-                                         name='out_layers_._{}_._bias'.format(i))
+                weight = self.add_weight(
+                    shape=(r_idx - l_idx, d_emb_i,),
+                    initializer="zeros",
+                    trainable=True,
+                    name="out_layers_._{}_._weight".format(i),
+                )
+                bias = self.add_weight(
+                    shape=(r_idx - l_idx,),
+                    initializer="zeros",
+                    trainable=True,
+                    name="out_layers_._{}_._bias".format(i),
+                )
                 self.out_layers.append((weight, bias))
         super(TFAdaptiveSoftmaxMask, self).build(input_shape)
 
@@ -100,8 +107,8 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
     def _logit(x, W, b, proj=None):
         y = x
         if proj is not None:
-            y = tf.einsum('ibd,ed->ibe', y, proj)
-        return tf.einsum('ibd,nd->ibn', y, W) + b
+            y = tf.einsum("ibd,ed->ibe", y, proj)
+        return tf.einsum("ibd,nd->ibn", y, W) + b
 
     @staticmethod
     def _gather_logprob(logprob, target):
@@ -114,7 +121,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
         hidden, target = inputs
         head_logprob = 0
         if self.n_clusters == 0:
-            softmax_b = tf.get_variable('bias', [self.config.vocab_size], initializer=tf.zeros_initializer())
+            softmax_b = tf.get_variable("bias", [self.config.vocab_size], initializer=tf.zeros_initializer())
             output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
             if target is not None:
                 loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
@@ -143,7 +150,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
 
                     head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0])
                     head_logprob = tf.nn.log_softmax(head_logit)
-                    out.append(head_logprob[..., :self.cutoffs[0]])
+                    out.append(head_logprob[..., : self.cutoffs[0]])
                     if target is not None:
                         cur_head_logprob = tf.boolean_mask(head_logprob, mask)
                         cur_logprob = self._gather_logprob(cur_head_logprob, cur_target)
@@ -170,6 +177,6 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
 
             # Log the loss as a metric (we could log arbitrary metrics,
             # including different metrics for training and inference.
-            self.add_metric(loss, name=self.name, aggregation='mean' if return_mean else '')
+            self.add_metric(loss, name=self.name, aggregation="mean" if return_mean else "")
 
         return out
diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py
index 0aa65a9f17..7ecd79afd3 100644
--- a/transformers/modeling_tf_utils.py
+++ b/transformers/modeling_tf_utils.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """TF general model utils."""
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 import os
@@ -26,12 +25,20 @@ from tensorflow.python.keras.saving import hdf5_format
 import h5py
 
 from .configuration_utils import PretrainedConfig
-from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS,
-                         cached_path, hf_bucket_url, is_remote_url)
+from .file_utils import (
+    TF2_WEIGHTS_NAME,
+    TF_WEIGHTS_NAME,
+    WEIGHTS_NAME,
+    DUMMY_INPUTS,
+    cached_path,
+    hf_bucket_url,
+    is_remote_url,
+)
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 
 logger = logging.getLogger(__name__)
 
+
 class TFPreTrainedModel(tf.keras.Model):
     r""" Base class for all TF models.
 
@@ -60,7 +67,7 @@ class TFPreTrainedModel(tf.keras.Model):
         Returns:
             tf.Tensor with dummy inputs
         """
-        return {'input_ids': tf.constant(DUMMY_INPUTS)}
+        return {"input_ids": tf.constant(DUMMY_INPUTS)}
 
     def __init__(self, config, *inputs, **kwargs):
         super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
@@ -70,7 +77,8 @@ class TFPreTrainedModel(tf.keras.Model):
                 "To create a model from a pretrained model use "
                 "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                     self.__class__.__name__, self.__class__.__name__
-                ))
+                )
+            )
         # Save config in model
         self.config = config
 
@@ -151,7 +159,9 @@ class TFPreTrainedModel(tf.keras.Model):
         """ Save a model and its configuration file to a directory, so that it
             can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.
         """
-        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
+        assert os.path.isdir(
+            save_directory
+        ), "Saving path should be a directory where the model and configuration can be saved"
 
         # Save configuration file
         self.config.save_pretrained(save_directory)
@@ -230,20 +240,22 @@ class TFPreTrainedModel(tf.keras.Model):
             model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_pt=True, config=config)
 
         """
-        config = kwargs.pop('config', None)
-        cache_dir = kwargs.pop('cache_dir', None)
-        from_pt = kwargs.pop('from_pt', False)
-        force_download = kwargs.pop('force_download', False)
-        resume_download = kwargs.pop('resume_download', False)
-        proxies = kwargs.pop('proxies', None)
-        output_loading_info = kwargs.pop('output_loading_info', False)
+        config = kwargs.pop("config", None)
+        cache_dir = kwargs.pop("cache_dir", None)
+        from_pt = kwargs.pop("from_pt", False)
+        force_download = kwargs.pop("force_download", False)
+        resume_download = kwargs.pop("resume_download", False)
+        proxies = kwargs.pop("proxies", None)
+        output_loading_info = kwargs.pop("output_loading_info", False)
 
         # Load config if we don't provide a configuration
         if not isinstance(config, PretrainedConfig):
             config_path = config if config is not None else pretrained_model_name_or_path
             config, model_kwargs = cls.config_class.from_pretrained(
-                config_path, *model_args,
-                cache_dir=cache_dir, return_unused_kwargs=True,
+                config_path,
+                *model_args,
+                cache_dir=cache_dir,
+                return_unused_kwargs=True,
                 force_download=force_download,
                 resume_download=resume_download,
                 **kwargs
@@ -263,9 +275,11 @@ class TFPreTrainedModel(tf.keras.Model):
                     # Load from a PyTorch checkpoint
                     archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
                 else:
-                    raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format(
-                        [WEIGHTS_NAME, TF2_WEIGHTS_NAME],
-                        pretrained_model_name_or_path))
+                    raise EnvironmentError(
+                        "Error no file named {} found in directory {} or `from_pt` set to False".format(
+                            [WEIGHTS_NAME, TF2_WEIGHTS_NAME], pretrained_model_name_or_path
+                        )
+                    )
             elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
             elif os.path.isfile(pretrained_model_name_or_path + ".index"):
@@ -273,31 +287,37 @@ class TFPreTrainedModel(tf.keras.Model):
             else:
                 archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=TF2_WEIGHTS_NAME)
                 if from_pt:
-                    raise EnvironmentError("Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name.")
+                    raise EnvironmentError(
+                        "Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name."
+                    )
 
             # redirect to the cache, if necessary
             try:
-                resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download,
-                                                    resume_download=resume_download, proxies=proxies)
+                resolved_archive_file = cached_path(
+                    archive_file,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    resume_download=resume_download,
+                    proxies=proxies,
+                )
             except EnvironmentError as e:
                 if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
-                    logger.error(
-                        "Couldn't reach server at '{}' to download pretrained weights.".format(
-                            archive_file))
+                    logger.error("Couldn't reach server at '{}' to download pretrained weights.".format(archive_file))
                 else:
                     logger.error(
                         "Model name '{}' was not found in model name list ({}). "
                         "We assumed '{}' was a path or url but couldn't find any file "
                         "associated to this path or url.".format(
                             pretrained_model_name_or_path,
-                            ', '.join(cls.pretrained_model_archive_map.keys()),
-                            archive_file))
+                            ", ".join(cls.pretrained_model_archive_map.keys()),
+                            archive_file,
+                        )
+                    )
                 raise e
             if resolved_archive_file == archive_file:
                 logger.info("loading weights file {}".format(archive_file))
             else:
-                logger.info("loading weights file {} from cache at {}".format(
-                    archive_file, resolved_archive_file))
+                logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file))
         else:
             resolved_archive_file = None
 
@@ -316,38 +336,42 @@ class TFPreTrainedModel(tf.keras.Model):
         try:
             model.load_weights(resolved_archive_file, by_name=True)
         except OSError:
-            raise OSError("Unable to load weights from h5 file. "
-                          "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. ")
+            raise OSError(
+                "Unable to load weights from h5 file. "
+                "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. "
+            )
 
         ret = model(model.dummy_inputs, training=False)  # Make sure restore ops are run
 
         # Check if the models are the same to output loading informations
-        with h5py.File(resolved_archive_file, 'r') as f:
-            if 'layer_names' not in f.attrs and 'model_weights' in f:
-                f = f['model_weights']
-            hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, 'layer_names'))
+        with h5py.File(resolved_archive_file, "r") as f:
+            if "layer_names" not in f.attrs and "model_weights" in f:
+                f = f["model_weights"]
+            hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, "layer_names"))
         model_layer_names = set(layer.name for layer in model.layers)
         missing_keys = list(model_layer_names - hdf5_layer_names)
         unexpected_keys = list(hdf5_layer_names - model_layer_names)
         error_msgs = []
 
         if len(missing_keys) > 0:
-            logger.info("Layers of {} not initialized from pretrained model: {}".format(
-                model.__class__.__name__, missing_keys))
+            logger.info(
+                "Layers of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys)
+            )
         if len(unexpected_keys) > 0:
-            logger.info("Layers from pretrained model not used in {}: {}".format(
-                model.__class__.__name__, unexpected_keys))
+            logger.info(
+                "Layers from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)
+            )
         if len(error_msgs) > 0:
-            raise RuntimeError('Error(s) in loading weights for {}:\n\t{}'.format(
-                            model.__class__.__name__, "\n\t".join(error_msgs)))
+            raise RuntimeError(
+                "Error(s) in loading weights for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
+            )
         if output_loading_info:
-            loading_info = {"missing_keys": missing_keys,
-                            "unexpected_keys": unexpected_keys,
-                            "error_msgs": error_msgs}
+            loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs}
             return model, loading_info
 
         return model
 
+
 class TFConv1D(tf.keras.layers.Layer):
     def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
         """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
@@ -360,13 +384,9 @@ class TFConv1D(tf.keras.layers.Layer):
 
     def build(self, input_shape):
         self.weight = self.add_weight(
-            "weight",
-            shape=[self.nx, self.nf],
-            initializer=get_initializer(self.initializer_range))
-        self.bias = self.add_weight(
-            "bias",
-            shape=[1, self.nf],
-            initializer=tf.zeros_initializer())
+            "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range)
+        )
+        self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer())
 
     def call(self, x):
         bz, sl = shape_list(x)[:2]
@@ -382,11 +402,12 @@ class TFConv1D(tf.keras.layers.Layer):
 class TFSharedEmbeddings(tf.keras.layers.Layer):
     """Construct shared token embeddings.
     """
+
     def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs):
         super(TFSharedEmbeddings, self).__init__(**kwargs)
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
-        self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range
+        self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
 
     def build(self, input_shape):
         """Build shared word embedding layer
@@ -394,9 +415,8 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
             https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
         """
         self.weight = self.add_weight(
-            "weight",
-            shape=[self.vocab_size, self.hidden_size],
-            initializer=get_initializer(self.initializer_range))
+            "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range)
+        )
         super(TFSharedEmbeddings, self).build(input_shape)
 
     def call(self, inputs, mode="embedding"):
@@ -455,35 +475,36 @@ class TFSequenceSummary(tf.keras.layers.Layer):
             summary_first_dropout: Add a dropout before the projection and activation
             summary_last_dropout: Add a dropout after the projection and activation
     """
+
     def __init__(self, config, initializer_range=0.02, **kwargs):
         super(TFSequenceSummary, self).__init__(**kwargs)
 
-        self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'
-        if self.summary_type == 'attn':
+        self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last"
+        if self.summary_type == "attn":
             # We should use a standard multi-head attention module with absolute positional embedding for that.
             # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
             # We can probably just use the multi-head attention module of PyTorch >=1.1.0
             raise NotImplementedError
 
-        self.has_summary = hasattr(config, 'summary_use_proj') and config.summary_use_proj
+        self.has_summary = hasattr(config, "summary_use_proj") and config.summary_use_proj
         if self.has_summary:
-            if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
+            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
                 num_classes = config.num_labels
             else:
                 num_classes = config.hidden_size
-            self.summary = tf.keras.layers.Dense(num_classes,
-                                                    kernel_initializer=get_initializer(initializer_range),
-                                                    name='summary')
+            self.summary = tf.keras.layers.Dense(
+                num_classes, kernel_initializer=get_initializer(initializer_range), name="summary"
+            )
 
-        self.has_activation = hasattr(config, 'summary_activation') and config.summary_activation == 'tanh'
+        self.has_activation = hasattr(config, "summary_activation") and config.summary_activation == "tanh"
         if self.has_activation:
             self.activation = tf.keras.activations.tanh
 
-        self.has_first_dropout = hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0
+        self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0
         if self.has_first_dropout:
             self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout)
 
-        self.has_last_dropout = hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0
+        self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0
         if self.has_last_dropout:
             self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)
 
@@ -502,29 +523,33 @@ class TFSequenceSummary(tf.keras.layers.Layer):
             cls_index = inputs[1] if len(inputs) > 1 else None
             assert len(inputs) <= 2, "Too many inputs."
         else:
-            input_ids = inputs.get('input_ids')
-            cls_index = inputs.get('cls_index', None)
+            input_ids = inputs.get("input_ids")
+            cls_index = inputs.get("cls_index", None)
 
-        if self.summary_type == 'last':
+        if self.summary_type == "last":
             output = hidden_states[:, -1]
-        elif self.summary_type == 'first':
+        elif self.summary_type == "first":
             output = hidden_states[:, 0]
-        elif self.summary_type == 'mean':
+        elif self.summary_type == "mean":
             output = tf.reduce_mean(hidden_states, axis=1)
-        elif self.summary_type == 'cls_index':
+        elif self.summary_type == "cls_index":
             hidden_shape = shape_list(hidden_states)  # e.g. [batch, num choices, seq length, hidden dims]
             if cls_index is None:
-                cls_index = tf.fill(hidden_shape[:-2], hidden_shape[-2] - 1)  # A tensor full of shape [batch] or [batch, num choices] full of sequence length
+                cls_index = tf.fill(
+                    hidden_shape[:-2], hidden_shape[-2] - 1
+                )  # A tensor full of shape [batch] or [batch, num choices] full of sequence length
             cls_shape = shape_list(cls_index)
             if len(cls_shape) <= len(hidden_shape) - 2:
                 cls_index = cls_index[..., tf.newaxis]
             # else:
-                # cls_index = cls_index[..., tf.newaxis]
-                # cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
+            # cls_index = cls_index[..., tf.newaxis]
+            # cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
             # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
             output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2)
-            output = tf.squeeze(output, axis=len(hidden_shape) - 2) # shape of output: (batch, num choices, hidden_size)
-        elif self.summary_type == 'attn':
+            output = tf.squeeze(
+                output, axis=len(hidden_shape) - 2
+            )  # shape of output: (batch, num choices, hidden_size)
+        elif self.summary_type == "attn":
             raise NotImplementedError
 
         if self.has_first_dropout:
@@ -541,12 +566,14 @@ class TFSequenceSummary(tf.keras.layers.Layer):
 
         return output
 
+
 def shape_list(x):
     """Deal with dynamic shape in tensorflow cleanly."""
     static = x.shape.as_list()
     dynamic = tf.shape(x)
     return [dynamic[i] if s is None else s for i, s in enumerate(static)]
 
+
 def get_initializer(initializer_range=0.02):
     """Creates a `tf.initializers.truncated_normal` with the given range.
     Args:
diff --git a/transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py
index a7cc8ea481..2f443ae2fc 100644
--- a/transformers/modeling_tf_xlm.py
+++ b/transformers/modeling_tf_xlm.py
@@ -25,30 +25,34 @@ import numpy as np
 import tensorflow as tf
 
 from .configuration_xlm import XLMConfig
-from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer, DUMMY_INPUTS
+from .modeling_tf_utils import (
+    TFPreTrainedModel,
+    TFSharedEmbeddings,
+    TFSequenceSummary,
+    shape_list,
+    get_initializer,
+    DUMMY_INPUTS,
+)
 from .file_utils import add_start_docstrings
 
 logger = logging.getLogger(__name__)
 
 TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-tf_model.h5",
-    'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-tf_model.h5",
-    'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-tf_model.h5",
-    'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-tf_model.h5",
-    'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-tf_model.h5",
-    'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-tf_model.h5",
-    'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-tf_model.h5",
-    'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-tf_model.h5",
-    'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-tf_model.h5",
-    'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-tf_model.h5",
+    "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-tf_model.h5",
+    "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-tf_model.h5",
+    "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-tf_model.h5",
+    "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-tf_model.h5",
+    "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-tf_model.h5",
+    "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-tf_model.h5",
+    "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-tf_model.h5",
+    "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-tf_model.h5",
+    "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-tf_model.h5",
+    "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-tf_model.h5",
 }
 
 
 def create_sinusoidal_embeddings(n_pos, dim, out):
-    position_enc = np.array([
-        [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
-        for pos in range(n_pos)
-    ])
+    position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
     out[:, 0::2] = tf.constant(np.sin(position_enc[:, 0::2]))
     out[:, 1::2] = tf.constant(np.cos(position_enc[:, 1::2]))
 
@@ -78,8 +82,9 @@ def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32):
 
     # attention mask is the same as mask, or triangular inferior attention (causal)
     if causal:
-        attn_mask = tf.less_equal(tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)),
-                                  alen[tf.newaxis, :, tf.newaxis])
+        attn_mask = tf.less_equal(
+            tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)), alen[tf.newaxis, :, tf.newaxis]
+        )
     else:
         attn_mask = mask
 
@@ -106,10 +111,10 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
         self.n_heads = n_heads
         assert self.dim % self.n_heads == 0
 
-        self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='q_lin')
-        self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='k_lin')
-        self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='v_lin')
-        self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='out_lin')
+        self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin")
+        self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin")
+        self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin")
+        self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin")
         self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
         self.pruned_heads = set()
 
@@ -125,7 +130,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
         # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
         bs, qlen, dim = shape_list(input)
         if kv is None:
-            klen = qlen if cache is None else cache['slen'] + qlen
+            klen = qlen if cache is None else cache["slen"] + qlen
         else:
             klen = shape_list(kv)[1]
         # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
@@ -141,40 +146,40 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
             """  compute context """
             return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
 
-        q = shape(self.q_lin(input))                                          # (bs, n_heads, qlen, dim_per_head)
+        q = shape(self.q_lin(input))  # (bs, n_heads, qlen, dim_per_head)
         if kv is None:
-            k = shape(self.k_lin(input))                                      # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v_lin(input))                                      # (bs, n_heads, qlen, dim_per_head)
+            k = shape(self.k_lin(input))  # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v_lin(input))  # (bs, n_heads, qlen, dim_per_head)
         elif cache is None or self.layer_id not in cache:
             k = v = kv
-            k = shape(self.k_lin(k))                                          # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v_lin(v))                                          # (bs, n_heads, qlen, dim_per_head)
+            k = shape(self.k_lin(k))  # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v_lin(v))  # (bs, n_heads, qlen, dim_per_head)
 
         if cache is not None:
             if self.layer_id in cache:
                 if kv is None:
                     k_, v_ = cache[self.layer_id]
-                    k = tf.concat([k_, k], axis=2)                             # (bs, n_heads, klen, dim_per_head)
-                    v = tf.concat([v_, v], axis=2)                             # (bs, n_heads, klen, dim_per_head)
+                    k = tf.concat([k_, k], axis=2)  # (bs, n_heads, klen, dim_per_head)
+                    v = tf.concat([v_, v], axis=2)  # (bs, n_heads, klen, dim_per_head)
                 else:
                     k, v = cache[self.layer_id]
             cache[self.layer_id] = (k, v)
 
-        q = q / math.sqrt(dim_per_head)                                       # (bs, n_heads, qlen, dim_per_head)
-        scores = tf.matmul(q, k, transpose_b=True)                            # (bs, n_heads, qlen, klen)
-        mask = tf.reshape(mask, mask_reshape)                           # (bs, n_heads, qlen, klen)
+        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, qlen, dim_per_head)
+        scores = tf.matmul(q, k, transpose_b=True)  # (bs, n_heads, qlen, klen)
+        mask = tf.reshape(mask, mask_reshape)  # (bs, n_heads, qlen, klen)
         # scores.masked_fill_(mask, -float('inf'))                            # (bs, n_heads, qlen, klen)
         scores = scores - 1e30 * (1.0 - mask)
 
-        weights = tf.nn.softmax(scores, axis=-1)                              # (bs, n_heads, qlen, klen)
-        weights = self.dropout(weights, training=training)                    # (bs, n_heads, qlen, klen)
+        weights = tf.nn.softmax(scores, axis=-1)  # (bs, n_heads, qlen, klen)
+        weights = self.dropout(weights, training=training)  # (bs, n_heads, qlen, klen)
 
         # Mask heads if we want to
         if head_mask is not None:
             weights = weights * head_mask
 
-        context = tf.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
-        context = unshape(context)                                            # (bs, qlen, dim)
+        context = tf.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)  # (bs, qlen, dim)
 
         outputs = (self.out_lin(context),)
         if self.output_attentions:
@@ -183,11 +188,10 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
 
 
 class TFTransformerFFN(tf.keras.layers.Layer):
-
     def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
         super(TFTransformerFFN, self).__init__(**kwargs)
-        self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name='lin1')
-        self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name='lin2')
+        self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
+        self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
         self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu
         self.dropout = tf.keras.layers.Dropout(config.dropout)
 
@@ -226,30 +230,36 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
         # assert len(self.id2lang) == len(self.lang2id) == self.n_langs
 
         # model parameters
-        self.dim = config.emb_dim       # 512 by default
+        self.dim = config.emb_dim  # 512 by default
         self.hidden_dim = self.dim * 4  # 2048 by default
-        self.n_heads = config.n_heads   # 8 by default
+        self.n_heads = config.n_heads  # 8 by default
         self.n_layers = config.n_layers
-        assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads'
+        assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads"
 
         # embeddings
         self.dropout = tf.keras.layers.Dropout(config.dropout)
         self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout)
 
-        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
-                                                             self.dim,
-                                                             embeddings_initializer=get_initializer(config.embed_init_std),
-                                                             name='position_embeddings')
+        self.position_embeddings = tf.keras.layers.Embedding(
+            config.max_position_embeddings,
+            self.dim,
+            embeddings_initializer=get_initializer(config.embed_init_std),
+            name="position_embeddings",
+        )
         if config.sinusoidal_embeddings:
             raise NotImplementedError
             # create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
         if config.n_langs > 1 and config.use_lang_emb:
-            self.lang_embeddings = tf.keras.layers.Embedding(self.n_langs,
-                                                             self.dim,
-                                                             embeddings_initializer=get_initializer(config.embed_init_std),
-                                                             name='lang_embeddings')
-        self.embeddings = TFSharedEmbeddings(self.n_words, self.dim, initializer_range=config.embed_init_std, name='embeddings')  # padding_idx=self.pad_index)
-        self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm_emb')
+            self.lang_embeddings = tf.keras.layers.Embedding(
+                self.n_langs,
+                self.dim,
+                embeddings_initializer=get_initializer(config.embed_init_std),
+                name="lang_embeddings",
+            )
+        self.embeddings = TFSharedEmbeddings(
+            self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings"
+        )  # padding_idx=self.pad_index)
+        self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb")
 
         # transformer layers
         self.attentions = []
@@ -261,13 +271,21 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
         #     self.encoder_attn = []
 
         for i in range(self.n_layers):
-            self.attentions.append(TFMultiHeadAttention(self.n_heads, self.dim, config=config, name='attentions_._{}'.format(i)))
-            self.layer_norm1.append(tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm1_._{}'.format(i)))
+            self.attentions.append(
+                TFMultiHeadAttention(self.n_heads, self.dim, config=config, name="attentions_._{}".format(i))
+            )
+            self.layer_norm1.append(
+                tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1_._{}".format(i))
+            )
             # if self.is_decoder:
             #     self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
             #     self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
-            self.ffns.append(TFTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name='ffns_._{}'.format(i)))
-            self.layer_norm2.append(tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm2_._{}'.format(i)))
+            self.ffns.append(
+                TFTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name="ffns_._{}".format(i))
+            )
+            self.layer_norm2.append(
+                tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2_._{}".format(i))
+            )
 
         if hasattr(config, "pruned_heads"):
             pruned_heads = config.pruned_heads.copy().items()
@@ -276,7 +294,6 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
                 if self.attentions[int(layer)].n_heads == config.n_heads:
                     self.prune_heads({int(layer): list(map(int, heads))})
 
-
     def get_input_embeddings(self):
         return self.embeddings
 
@@ -290,9 +307,19 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
         """
         raise NotImplementedError
 
-    def call(self, inputs, attention_mask=None, langs=None, token_type_ids=None,
-             position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None,
-             training=False):  # removed: src_enc=None, src_len=None
+    def call(
+        self,
+        inputs,
+        attention_mask=None,
+        langs=None,
+        token_type_ids=None,
+        position_ids=None,
+        lengths=None,
+        cache=None,
+        head_mask=None,
+        inputs_embeds=None,
+        training=False,
+    ):  # removed: src_enc=None, src_len=None
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
             attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
@@ -305,15 +332,15 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
             inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
             assert len(inputs) <= 9, "Too many inputs."
         elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            attention_mask = inputs.get('attention_mask', attention_mask)
-            langs = inputs.get('langs', langs)
-            token_type_ids = inputs.get('token_type_ids', token_type_ids)
-            position_ids = inputs.get('position_ids', position_ids)
-            lengths = inputs.get('lengths', lengths)
-            cache = inputs.get('cache', cache)
-            head_mask = inputs.get('head_mask', head_mask)
-            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            input_ids = inputs.get("input_ids")
+            attention_mask = inputs.get("attention_mask", attention_mask)
+            langs = inputs.get("langs", langs)
+            token_type_ids = inputs.get("token_type_ids", token_type_ids)
+            position_ids = inputs.get("position_ids", position_ids)
+            lengths = inputs.get("lengths", lengths)
+            cache = inputs.get("cache", cache)
+            head_mask = inputs.get("head_mask", head_mask)
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             assert len(inputs) <= 9, "Too many inputs."
         else:
             input_ids = inputs
@@ -331,7 +358,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
             if input_ids is not None:
                 lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1)
             else:
-                lengths = tf.convert_to_tensor([slen]*bs, tf.int32)
+                lengths = tf.convert_to_tensor([slen] * bs, tf.int32)
         # mask = input_ids != self.pad_index
 
         # check inputs
@@ -375,7 +402,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
 
         # do not recompute cached elements
         if cache is not None and input_ids is not None:
-            _slen = slen - cache['slen']
+            _slen = slen - cache["slen"]
             input_ids = input_ids[:, -_slen:]
             position_ids = position_ids[:, -_slen:]
             if langs is not None:
@@ -430,7 +457,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
 
         # update cache length
         if cache is not None:
-            cache['slen'] += tensor.size(1)
+            cache["slen"] += tensor.size(1)
 
         # move back sequence length to dimension 0
         # tensor = tensor.transpose(0, 1)
@@ -447,6 +474,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = XLMConfig
     pretrained_model_archive_map = TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "transformer"
@@ -460,7 +488,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel):
             langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
         else:
             langs_list = None
-        return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}
+        return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list}
 
 
 XLM_START_DOCSTRING = r"""    The XLM model was proposed in
@@ -554,8 +582,12 @@ XLM_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare XLM Model transformer outputing raw hidden-states without any specific head on top.",
-                      XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare XLM Model transformer outputing raw hidden-states without any specific head on top.",
+    XLM_START_DOCSTRING,
+    XLM_INPUTS_DOCSTRING,
+)
 class TFXLMModel(TFXLMPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -581,20 +613,21 @@ class TFXLMModel(TFXLMPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFXLMModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFXLMMainLayer(config, name='transformer')
+        self.transformer = TFXLMMainLayer(config, name="transformer")
 
     def call(self, inputs, **kwargs):
         outputs = self.transformer(inputs, **kwargs)
         return outputs
 
 
-
 class TFXLMPredLayer(tf.keras.layers.Layer):
     """
     Prediction layer (cross_entropy or adaptive_softmax).
     """
+
     def __init__(self, config, input_embeddings, **kwargs):
         super(TFXLMPredLayer, self).__init__(**kwargs)
         self.asm = config.asm
@@ -614,10 +647,7 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
 
     def build(self, input_shape):
         # The output weights are the same as the input embeddings, but there is an output-only bias for each token.
-        self.bias = self.add_weight(shape=(self.n_words,),
-                                    initializer='zeros',
-                                    trainable=True,
-                                    name='bias')
+        self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias")
         super(TFXLMPredLayer, self).build(input_shape)
 
     def call(self, hidden_states):
@@ -626,9 +656,12 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
         return hidden_states
 
 
-@add_start_docstrings("""The XLM Model transformer with a language modeling head on top
+@add_start_docstrings(
+    """The XLM Model transformer with a language modeling head on top
     (linear layer with weights tied to the input embeddings). """,
-    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+    XLM_START_DOCSTRING,
+    XLM_INPUTS_DOCSTRING,
+)
 class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -654,10 +687,11 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFXLMWithLMHeadModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFXLMMainLayer(config, name='transformer')
-        self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name='pred_layer_._proj')
+        self.transformer = TFXLMMainLayer(config, name="transformer")
+        self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
 
     def get_output_embeddings(self):
         return self.pred_layer.input_embeddings
@@ -672,9 +706,12 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
         return outputs
 
 
-@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings(
+    """XLM Model with a sequence classification/regression head on top (a linear layer on top of
     the pooled output) e.g. for GLUE tasks. """,
-    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+    XLM_START_DOCSTRING,
+    XLM_INPUTS_DOCSTRING,
+)
 class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -701,12 +738,13 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
         logits = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFXLMForSequenceClassification, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
 
-        self.transformer = TFXLMMainLayer(config, name='transformer')
-        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name='sequence_summary')
+        self.transformer = TFXLMMainLayer(config, name="transformer")
+        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
 
     def call(self, inputs, **kwargs):
         transformer_outputs = self.transformer(inputs, **kwargs)
@@ -718,9 +756,12 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
         return outputs
 
 
-@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
     the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+    XLM_START_DOCSTRING,
+    XLM_INPUTS_DOCSTRING,
+)
 class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -748,12 +789,13 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
         start_scores, end_scores = outputs[:2]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFXLMMainLayer(config, name='transformer')
-        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.init_std),
-                                                name='qa_outputs')
+        self.transformer = TFXLMMainLayer(config, name="transformer")
+        self.qa_outputs = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
+        )
 
     def call(self, inputs, **kwargs):
         transformer_outputs = self.transformer(inputs, **kwargs)
@@ -765,6 +807,8 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
         start_logits = tf.squeeze(start_logits, axis=-1)
         end_logits = tf.squeeze(end_logits, axis=-1)
 
-        outputs = (start_logits, end_logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
+        outputs = (start_logits, end_logits,) + transformer_outputs[
+            1:
+        ]  # Keep mems, hidden states, attentions if there are in it
 
         return outputs  # start_logits, end_logits, (hidden_states), (attentions)
diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py
index 2f1fe150c6..c1ed720f96 100644
--- a/transformers/modeling_tf_xlnet.py
+++ b/transformers/modeling_tf_xlnet.py
@@ -35,8 +35,8 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 
 TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-tf_model.h5",
-    'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-tf_model.h5",
+    "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-tf_model.h5",
+    "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-tf_model.h5",
 }
 
 
@@ -45,8 +45,7 @@ def gelu(x):
         XLNet is using OpenAI GPT's gelu
         Also see https://arxiv.org/abs/1606.08415
     """
-    cdf = 0.5 * (1.0 + tf.tanh(
-        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
+    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
     return x * cdf
 
 
@@ -54,9 +53,11 @@ def swish(x):
     return x * tf.sigmoid(x)
 
 
-ACT2FN = {"gelu": tf.keras.layers.Activation(gelu),
-          "relu": tf.keras.activations.relu,
-          "swish": tf.keras.layers.Activation(swish)}
+ACT2FN = {
+    "gelu": tf.keras.layers.Activation(gelu),
+    "relu": tf.keras.activations.relu,
+    "swish": tf.keras.layers.Activation(swish),
+}
 
 
 class TFXLNetRelativeAttention(tf.keras.layers.Layer):
@@ -67,7 +68,8 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
         if config.d_model % config.n_head != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.d_model, config.n_head))
+                "heads (%d)" % (config.d_model, config.n_head)
+            )
 
         self.n_head = config.n_head
         self.d_head = config.d_head
@@ -75,38 +77,38 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
         self.scale = 1 / (config.d_head ** 0.5)
         self.initializer_range = config.initializer_range
 
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
+        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
         self.dropout = tf.keras.layers.Dropout(config.dropout)
 
     def build(self, input_shape):
         initializer = get_initializer(self.initializer_range)
-        self.q = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
-                                 initializer=initializer,
-                                 trainable=True, name='q')
-        self.k = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
-                                 initializer=initializer,
-                                 trainable=True, name='k')
-        self.v = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
-                                 initializer=initializer,
-                                 trainable=True, name='v')
-        self.o = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
-                                 initializer=initializer,
-                                 trainable=True, name='o')
-        self.r = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
-                                 initializer=initializer,
-                                 trainable=True, name='r')
-        self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head),
-                                        initializer='zeros',
-                                        trainable=True, name='r_r_bias')
-        self.r_s_bias = self.add_weight(shape=(self.n_head, self.d_head),
-                                        initializer='zeros',
-                                        trainable=True, name='r_s_bias')
-        self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head),
-                                        initializer='zeros',
-                                        trainable=True, name='r_w_bias')
-        self.seg_embed = self.add_weight(shape=(2, self.n_head, self.d_head),
-                                        initializer=initializer,
-                                        trainable=True, name='seg_embed')
+        self.q = self.add_weight(
+            shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="q"
+        )
+        self.k = self.add_weight(
+            shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="k"
+        )
+        self.v = self.add_weight(
+            shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="v"
+        )
+        self.o = self.add_weight(
+            shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="o"
+        )
+        self.r = self.add_weight(
+            shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="r"
+        )
+        self.r_r_bias = self.add_weight(
+            shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
+        )
+        self.r_s_bias = self.add_weight(
+            shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_s_bias"
+        )
+        self.r_w_bias = self.add_weight(
+            shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
+        )
+        self.seg_embed = self.add_weight(
+            shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed"
+        )
         super(TFXLNetRelativeAttention, self).build(input_shape)
 
     def prune_heads(self, heads):
@@ -130,18 +132,18 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
         q_head, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask, head_mask = inputs
 
         # content based attention score
-        ac = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_w_bias, k_head_h)
+        ac = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_w_bias, k_head_h)
 
         # position based attention score
-        bd = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_r_bias, k_head_r)
+        bd = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_r_bias, k_head_r)
         bd = self.rel_shift(bd, klen=shape_list(ac)[1])
 
         # segment based attention score
         if seg_mat is None:
             ef = 0
         else:
-            ef = tf.einsum('ibnd,snd->ibns', q_head + self.r_s_bias, self.seg_embed)
-            ef = tf.einsum('ijbs,ibns->ijbn', seg_mat, ef)
+            ef = tf.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed)
+            ef = tf.einsum("ijbs,ibns->ijbn", seg_mat, ef)
 
         # merge attention scores and perform masking
         attn_score = (ac + bd + ef) * self.scale
@@ -162,7 +164,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
             attn_prob = attn_prob * head_mask
 
         # attention output
-        attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, v_head_h)
+        attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, v_head_h)
 
         if self.output_attentions:
             return attn_vec, attn_prob
@@ -174,7 +176,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
         # post-attention projection (back to `d_model`)
         h, attn_vec = inputs
 
-        attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, self.o)
+        attn_out = tf.einsum("ibnd,hnd->ibh", attn_vec, self.o)
 
         attn_out = self.dropout(attn_out, training=training)
 
@@ -185,8 +187,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
         return output
 
     def call(self, inputs, training=False):
-        (h, g, attn_mask_h, attn_mask_g,
-         r, seg_mat, mems, target_mapping, head_mask) = inputs
+        (h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems, target_mapping, head_mask) = inputs
 
         if g is not None:
             ###### Two-stream attention with relative positional encoding.
@@ -197,22 +198,22 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
                 cat = h
 
             # content-based key head
-            k_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.k)
+            k_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.k)
 
             # content-based value head
-            v_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.v)
+            v_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.v)
 
             # position-based key head
-            k_head_r = tf.einsum('ibh,hnd->ibnd', r, self.r)
+            k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r)
 
             ##### h-stream
             # content-stream query head
-            q_head_h = tf.einsum('ibh,hnd->ibnd', h, self.q)
+            q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q)
 
             # core attention ops
             attn_vec_h = self.rel_attn_core(
-                [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask],
-                training=training)
+                [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], training=training
+            )
 
             if self.output_attentions:
                 attn_vec_h, attn_prob_h = attn_vec_h
@@ -222,23 +223,23 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
 
             ##### g-stream
             # query-stream query head
-            q_head_g = tf.einsum('ibh,hnd->ibnd', g, self.q)
+            q_head_g = tf.einsum("ibh,hnd->ibnd", g, self.q)
 
             # core attention ops
             if target_mapping is not None:
-                q_head_g = tf.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping)
+                q_head_g = tf.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping)
                 attn_vec_g = self.rel_attn_core(
-                    [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask],
-                    training=training)
+                    [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], training=training
+                )
 
                 if self.output_attentions:
                     attn_vec_g, attn_prob_g = attn_vec_g
 
-                attn_vec_g = tf.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping)
+                attn_vec_g = tf.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping)
             else:
                 attn_vec_g = self.rel_attn_core(
-                    [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask],
-                    training=training)
+                    [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], training=training
+                )
 
                 if self.output_attentions:
                     attn_vec_g, attn_prob_g = attn_vec_g
@@ -257,17 +258,17 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
                 cat = h
 
             # content heads
-            q_head_h = tf.einsum('ibh,hnd->ibnd', h, self.q)
-            k_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.k)
-            v_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.v)
+            q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q)
+            k_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.k)
+            v_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.v)
 
             # positional heads
-            k_head_r = tf.einsum('ibh,hnd->ibnd', r, self.r)
+            k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r)
 
             # core attention ops
             attn_vec = self.rel_attn_core(
-                [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask],
-                training=training)
+                [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], training=training
+            )
 
             if self.output_attentions:
                 attn_vec, attn_prob = attn_vec
@@ -281,19 +282,21 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
             outputs = outputs + (attn_prob,)
         return outputs
 
+
 class TFXLNetFeedForward(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFXLNetFeedForward, self).__init__(**kwargs)
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
-        self.layer_1 = tf.keras.layers.Dense(config.d_inner,
-                                             kernel_initializer=get_initializer(config.initializer_range),
-                                             name='layer_1')
-        self.layer_2 = tf.keras.layers.Dense(config.d_model,
-                                             kernel_initializer=get_initializer(config.initializer_range),
-                                             name='layer_2')
+        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.layer_1 = tf.keras.layers.Dense(
+            config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1"
+        )
+        self.layer_2 = tf.keras.layers.Dense(
+            config.d_model, kernel_initializer=get_initializer(config.initializer_range), name="layer_2"
+        )
         self.dropout = tf.keras.layers.Dropout(config.dropout)
-        if isinstance(config.ff_activation, str) or \
-                (sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)):
+        if isinstance(config.ff_activation, str) or (
+            sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)
+        ):
             self.activation_function = ACT2FN[config.ff_activation]
         else:
             self.activation_function = config.ff_activation
@@ -308,11 +311,12 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
         output = self.layer_norm(output + inp)
         return output
 
+
 class TFXLNetLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFXLNetLayer, self).__init__(**kwargs)
-        self.rel_attn = TFXLNetRelativeAttention(config, name='rel_attn')
-        self.ff = TFXLNetFeedForward(config, name='ff')
+        self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn")
+        self.ff = TFXLNetFeedForward(config, name="ff")
         self.dropout = tf.keras.layers.Dropout(config.dropout)
 
     def call(self, inputs, training=False):
@@ -336,10 +340,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
         self.input_embeddings = input_embeddings
 
     def build(self, input_shape):
-        self.bias = self.add_weight(shape=(self.vocab_size,),
-                                    initializer='zeros',
-                                    trainable=True,
-                                    name='bias')
+        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
         super(TFXLNetLMHead, self).build(input_shape)
 
     def call(self, hidden_states):
@@ -366,8 +367,10 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         self.use_bfloat16 = config.use_bfloat16
         self.initializer_range = config.initializer_range
 
-        self.word_embedding = TFSharedEmbeddings(config.vocab_size, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
-        self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
+        self.word_embedding = TFSharedEmbeddings(
+            config.vocab_size, config.d_model, initializer_range=config.initializer_range, name="word_embedding"
+        )
+        self.layer = [TFXLNetLayer(config, name="layer_._{}".format(i)) for i in range(config.n_layer)]
         self.dropout = tf.keras.layers.Dropout(config.dropout)
 
     def get_input_embeddings(self):
@@ -375,9 +378,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
 
     def build(self, input_shape):
         initializer = get_initializer(self.initializer_range)
-        self.mask_emb = self.add_weight(shape=(1, 1, self.d_model),
-                                 initializer=initializer,
-                                 trainable=True, name='mask_emb')
+        self.mask_emb = self.add_weight(
+            shape=(1, 1, self.d_model), initializer=initializer, trainable=True, name="mask_emb"
+        )
 
     def _resize_token_embeddings(self, new_num_tokens):
         raise NotImplementedError
@@ -417,18 +420,18 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
     def cache_mem(self, curr_out, prev_mem):
         """cache hidden states into memory."""
         if self.reuse_len is not None and self.reuse_len > 0:
-            curr_out = curr_out[:self.reuse_len]
+            curr_out = curr_out[: self.reuse_len]
 
         if prev_mem is None:
-            new_mem = curr_out[-self.mem_len:]
+            new_mem = curr_out[-self.mem_len :]
         else:
-            new_mem = tf.concat([prev_mem, curr_out], 0)[-self.mem_len:]
+            new_mem = tf.concat([prev_mem, curr_out], 0)[-self.mem_len :]
 
         return tf.stop_gradient(new_mem)
 
     @staticmethod
     def positional_embedding(pos_seq, inv_freq, bsz=None):
-        sinusoid_inp = tf.einsum('i,d->id', pos_seq, inv_freq)
+        sinusoid_inp = tf.einsum("i,d->id", pos_seq, inv_freq)
         pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], axis=-1)
         pos_emb = pos_emb[:, None, :]
 
@@ -444,14 +447,14 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
             freq_seq = tf.cast(freq_seq, dtype=dtype)
         inv_freq = 1 / (10000 ** (freq_seq / self.d_model))
 
-        if self.attn_type == 'bi':
+        if self.attn_type == "bi":
             # beg, end = klen - 1, -qlen
             beg, end = klen, -qlen
-        elif self.attn_type == 'uni':
+        elif self.attn_type == "uni":
             # beg, end = klen - 1, -1
             beg, end = klen, -1
         else:
-            raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type))
+            raise ValueError("Unknown `attn_type` {}.".format(self.attn_type))
 
         if self.bi_data:
             fwd_pos_seq = tf.range(beg, end, -1.0)
@@ -467,9 +470,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
 
             if bsz is not None:
                 # With bi_data, the batch size should be divisible by 2.
-                assert bsz%2 == 0
-                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
-                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
+                assert bsz % 2 == 0
+                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2)
+                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2)
             else:
                 fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
                 bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
@@ -485,8 +488,19 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
 
         return pos_emb
 
-    def call(self, inputs, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-            token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, training=False):
+    def call(
+        self,
+        inputs,
+        attention_mask=None,
+        mems=None,
+        perm_mask=None,
+        target_mapping=None,
+        token_type_ids=None,
+        input_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        training=False,
+    ):
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
             attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
@@ -499,15 +513,15 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
             inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
             assert len(inputs) <= 9, "Too many inputs."
         elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            attention_mask = inputs.get('attention_mask', attention_mask)
-            mems = inputs.get('mems', mems)
-            perm_mask = inputs.get('perm_mask', perm_mask)
-            target_mapping = inputs.get('target_mapping', target_mapping)
-            token_type_ids = inputs.get('token_type_ids', token_type_ids)
-            input_mask = inputs.get('input_mask', input_mask)
-            head_mask = inputs.get('head_mask', head_mask)
-            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            input_ids = inputs.get("input_ids")
+            attention_mask = inputs.get("attention_mask", attention_mask)
+            mems = inputs.get("mems", mems)
+            perm_mask = inputs.get("perm_mask", perm_mask)
+            target_mapping = inputs.get("target_mapping", target_mapping)
+            token_type_ids = inputs.get("token_type_ids", token_type_ids)
+            input_mask = inputs.get("input_mask", input_mask)
+            head_mask = inputs.get("head_mask", head_mask)
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             assert len(inputs) <= 9, "Too many inputs."
         else:
             input_ids = inputs
@@ -540,17 +554,19 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
 
         ##### Attention mask
         # causal attention mask
-        if self.attn_type == 'uni':
+        if self.attn_type == "uni":
             attn_mask = self.create_mask(qlen, mlen)
             attn_mask = attn_mask[:, :, None, None]
-        elif self.attn_type == 'bi':
+        elif self.attn_type == "bi":
             attn_mask = None
         else:
-            raise ValueError('Unsupported attention type: {}'.format(self.attn_type))
+            raise ValueError("Unsupported attention type: {}".format(self.attn_type))
 
         # data mask: input mask & perm mask
-        assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) " \
+        assert input_mask is None or attention_mask is None, (
+            "You can only use one of input_mask (uses 1 for padding) "
             "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one."
+        )
         if input_mask is None and attention_mask is not None:
             input_mask = 1.0 - tf.cast(attention_mask, dtype=dtype_float)
         if input_mask is not None and perm_mask is not None:
@@ -564,8 +580,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
 
         if data_mask is not None:
             # all mems can be attended to
-            mems_mask = tf.zeros([shape_list(data_mask)[0], mlen, bsz],
-                                dtype=dtype_float)
+            mems_mask = tf.zeros([shape_list(data_mask)[0], mlen, bsz], dtype=dtype_float)
             data_mask = tf.concat([mems_mask, data_mask], axis=1)
             if attn_mask is None:
                 attn_mask = data_mask[:, :, :, None]
@@ -590,9 +605,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         output_h = self.dropout(word_emb_k, training=training)
         if target_mapping is not None:
             word_emb_q = tf.tile(self.mask_emb, [shape_list(target_mapping)[0], bsz, 1])
-        # else:  # We removed the inp_q input which was same as target mapping
-        #     inp_q_ext = inp_q[:, :, None]
-        #     word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
+            # else:  # We removed the inp_q input which was same as target mapping
+            #     inp_q_ext = inp_q[:, :, None]
+            #     word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
             output_g = self.dropout(word_emb_q, training=training)
         else:
             output_g = None
@@ -604,9 +619,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
             cat_ids = tf.concat([mem_pad, token_type_ids], 0)
 
             # `1` indicates not in the same segment [qlen x klen x bsz]
-            seg_mat = tf.cast(
-                tf.logical_not(tf.equal(token_type_ids[:, None], cat_ids[None, :])),
-                tf.int32)
+            seg_mat = tf.cast(tf.logical_not(tf.equal(token_type_ids[:, None], cat_ids[None, :])), tf.int32)
             seg_mat = tf.one_hot(seg_mat, 2, dtype=dtype_float)
         else:
             seg_mat = None
@@ -626,7 +639,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
                 head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
                 head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to fload if need + fp16 compatibility
         else:
             head_mask = [None] * self.n_layer
 
@@ -643,9 +658,10 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
             if self.output_hidden_states:
                 hidden_states.append((output_h, output_g) if output_g is not None else output_h)
 
-            outputs = layer_module([output_h, output_g, non_tgt_mask, attn_mask,
-                                    pos_emb, seg_mat, mems[i], target_mapping,
-                                    head_mask[i]], training=training)
+            outputs = layer_module(
+                [output_h, output_g, non_tgt_mask, attn_mask, pos_emb, seg_mat, mems[i], target_mapping, head_mask[i]],
+                training=training,
+            )
             output_h, output_g = outputs[:2]
             if self.output_attentions:
                 attentions.append(outputs[2])
@@ -679,6 +695,7 @@ class TFXLNetPreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = XLNetConfig
     pretrained_model_archive_map = TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "transformer"
@@ -784,8 +801,12 @@ XLNET_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.",
-                      XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.",
+    XLNET_START_DOCSTRING,
+    XLNET_INPUTS_DOCSTRING,
+)
 class TFXLNetModel(TFXLNetPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -816,18 +837,22 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFXLNetModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFXLNetMainLayer(config, name='transformer')
+        self.transformer = TFXLNetMainLayer(config, name="transformer")
 
     def call(self, inputs, **kwargs):
         outputs = self.transformer(inputs, **kwargs)
         return outputs
 
 
-@add_start_docstrings("""XLNet Model with a language modeling head on top
+@add_start_docstrings(
+    """XLNet Model with a language modeling head on top
     (linear layer with weights tied to the input embeddings). """,
-    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+    XLNET_START_DOCSTRING,
+    XLNET_INPUTS_DOCSTRING,
+)
 class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -865,10 +890,11 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
         next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFXLNetLMHeadModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFXLNetMainLayer(config, name='transformer')
-        self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name='lm_loss')
+        self.transformer = TFXLNetMainLayer(config, name="transformer")
+        self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss")
 
     def get_output_embeddings(self):
         return self.lm_loss.input_embeddings
@@ -883,9 +909,12 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
         return outputs  # return logits, (mems), (hidden states), (attentions)
 
 
-@add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings(
+    """XLNet Model with a sequence classification/regression head on top (a linear layer on top of
     the pooled output) e.g. for GLUE tasks. """,
-    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+    XLNET_START_DOCSTRING,
+    XLNET_INPUTS_DOCSTRING,
+)
 class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -916,15 +945,18 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
         logits = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFXLNetForSequenceClassification, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
 
-        self.transformer = TFXLNetMainLayer(config, name='transformer')
-        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.initializer_range, name='sequence_summary')
-        self.logits_proj = tf.keras.layers.Dense(config.num_labels,
-                                                 kernel_initializer=get_initializer(config.initializer_range),
-                                                 name='logits_proj')
+        self.transformer = TFXLNetMainLayer(config, name="transformer")
+        self.sequence_summary = TFSequenceSummary(
+            config, initializer_range=config.initializer_range, name="sequence_summary"
+        )
+        self.logits_proj = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
+        )
 
     def call(self, inputs, **kwargs):
         transformer_outputs = self.transformer(inputs, **kwargs)
@@ -938,9 +970,12 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
         return outputs  # return logits, (mems), (hidden states), (attentions)
 
 
-@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """XLNet Model with a token classification head on top (a linear layer on top of
     the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+    XLNET_START_DOCSTRING,
+    XLNET_INPUTS_DOCSTRING,
+)
 class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -971,14 +1006,15 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
         scores = outputs[0]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFXLNetForTokenClassification, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
 
-        self.transformer = TFXLNetMainLayer(config, name='transformer')
-        self.classifier = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='classifier')
+        self.transformer = TFXLNetMainLayer(config, name="transformer")
+        self.classifier = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
 
     def call(self, inputs, **kwargs):
         transformer_outputs = self.transformer(inputs, **kwargs)
@@ -1027,12 +1063,13 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
         start_scores, end_scores = outputs[:2]
 
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFXLNetMainLayer(config, name='transformer')
-        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='qa_outputs')
+        self.transformer = TFXLNetMainLayer(config, name="transformer")
+        self.qa_outputs = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
+        )
 
     def call(self, inputs, **kwargs):
         transformer_outputs = self.transformer(inputs, **kwargs)
@@ -1044,10 +1081,13 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
         start_logits = tf.squeeze(start_logits, axis=-1)
         end_logits = tf.squeeze(end_logits, axis=-1)
 
-        outputs = (start_logits, end_logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
+        outputs = (start_logits, end_logits,) + transformer_outputs[
+            1:
+        ]  # Keep mems, hidden states, attentions if there are in it
 
         return outputs  # start_logits, end_logits, (mems), (hidden_states), (attentions)
 
+
 # @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
 #     the hidden-states output to compute `span start logits` and `span end logits`). """,
 #     XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
diff --git a/transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py
index 70ef4aea3e..cee61ed37a 100644
--- a/transformers/modeling_transfo_xl.py
+++ b/transformers/modeling_transfo_xl.py
@@ -42,65 +42,62 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 
 TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin",
+    "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin",
 }
 
+
 def build_tf_to_pytorch_map(model, config):
     """ A map of modules from TF to PyTorch.
         This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.
     """
     tf_to_pt_map = {}
 
-    if hasattr(model, 'transformer'):
+    if hasattr(model, "transformer"):
         # We are loading in a TransfoXLLMHeadModel => we will load also the Adaptive Softmax
-        tf_to_pt_map.update({
-            "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight,
-            "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias})
-        for i, (out_l, proj_l, tie_proj) in enumerate(zip(
-                                model.crit.out_layers,
-                                model.crit.out_projs,
-                                config.tie_projs)):
+        tf_to_pt_map.update(
+            {
+                "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight,
+                "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias,
+            }
+        )
+        for i, (out_l, proj_l, tie_proj) in enumerate(
+            zip(model.crit.out_layers, model.crit.out_projs, config.tie_projs)
+        ):
             layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i
             if config.tie_weight:
-                tf_to_pt_map.update({
-                    layer_str + 'b': out_l.bias})
+                tf_to_pt_map.update({layer_str + "b": out_l.bias})
             else:
                 raise NotImplementedError
                 # I don't think this is implemented in the TF code
-                tf_to_pt_map.update({
-                    layer_str + 'lookup_table': out_l.weight,
-                    layer_str + 'b': out_l.bias})
+                tf_to_pt_map.update({layer_str + "lookup_table": out_l.weight, layer_str + "b": out_l.bias})
             if not tie_proj:
-                tf_to_pt_map.update({
-                    layer_str + 'proj': proj_l
-                    })
+                tf_to_pt_map.update({layer_str + "proj": proj_l})
         # Now load the rest of the transformer
         model = model.transformer
 
     # Embeddings
     for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
         layer_str = "transformer/adaptive_embed/cutoff_%d/" % i
-        tf_to_pt_map.update({
-            layer_str + 'lookup_table': embed_l.weight,
-            layer_str + 'proj_W': proj_l
-            })
+        tf_to_pt_map.update({layer_str + "lookup_table": embed_l.weight, layer_str + "proj_W": proj_l})
 
     # Transformer blocks
     for i, b in enumerate(model.layers):
         layer_str = "transformer/layer_%d/" % i
-        tf_to_pt_map.update({
-            layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight,
-            layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias,
-            layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight,
-            layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight,
-            layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight,
-            layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight,
-            layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias,
-            layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight,
-            layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias,
-            layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight,
-            layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias,
-        })
+        tf_to_pt_map.update(
+            {
+                layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight,
+                layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias,
+                layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight,
+                layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight,
+                layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight,
+                layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight,
+                layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias,
+                layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight,
+                layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias,
+                layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight,
+                layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias,
+            }
+        )
 
     # Relative positioning biases
     if config.untie_r:
@@ -112,11 +109,10 @@ def build_tf_to_pytorch_map(model, config):
     else:
         r_r_list = [model.r_r_bias]
         r_w_list = [model.r_w_bias]
-    tf_to_pt_map.update({
-        'transformer/r_r_bias': r_r_list,
-        'transformer/r_w_bias': r_w_list})
+    tf_to_pt_map.update({"transformer/r_r_bias": r_r_list, "transformer/r_w_bias": r_w_list})
     return tf_to_pt_map
 
+
 def load_tf_weights_in_transfo_xl(model, config, tf_path):
     """ Load tf checkpoints in a pytorch model
     """
@@ -124,8 +120,10 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error(
+            "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
         raise
     # Build TF to PyTorch weights loading map
     tf_to_pt_map = build_tf_to_pytorch_map(model, config)
@@ -143,9 +141,9 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
         array = tf_weights[name]
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
-        if 'kernel' in name or 'proj' in name:
+        if "kernel" in name or "proj" in name:
             array = np.transpose(array)
-        if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:
+        if ("r_r_bias" in name or "r_w_bias" in name) and len(pointer) > 1:
             # Here we will split the TF weigths
             assert len(pointer) == array.shape[0]
             for i, p_i in enumerate(pointer):
@@ -166,10 +164,10 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
             logger.info("Initialize PyTorch weight {}".format(name))
             pointer.data = torch.from_numpy(array)
         tf_weights.pop(name, None)
-        tf_weights.pop(name + '/Adam', None)
-        tf_weights.pop(name + '/Adam_1', None)
+        tf_weights.pop(name + "/Adam", None)
+        tf_weights.pop(name + "/Adam_1", None)
 
-    logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys())))
     return model
 
 
@@ -180,17 +178,16 @@ class PositionalEmbedding(nn.Module):
         self.demb = demb
 
         inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))
-        self.register_buffer('inv_freq', inv_freq)
+        self.register_buffer("inv_freq", inv_freq)
 
     def forward(self, pos_seq, bsz=None):
         sinusoid_inp = torch.ger(pos_seq, self.inv_freq)
         pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
 
         if bsz is not None:
-            return pos_emb[:,None,:].expand(-1, bsz, -1)
+            return pos_emb[:, None, :].expand(-1, bsz, -1)
         else:
-            return pos_emb[:,None,:]
-
+            return pos_emb[:, None, :]
 
 
 class PositionwiseFF(nn.Module):
@@ -202,7 +199,8 @@ class PositionwiseFF(nn.Module):
         self.dropout = dropout
 
         self.CoreNet = nn.Sequential(
-            nn.Linear(d_model, d_inner), nn.ReLU(inplace=True),
+            nn.Linear(d_model, d_inner),
+            nn.ReLU(inplace=True),
             nn.Dropout(dropout),
             nn.Linear(d_inner, d_model),
             nn.Dropout(dropout),
@@ -230,10 +228,22 @@ class PositionwiseFF(nn.Module):
 
 
 class RelPartialLearnableMultiHeadAttn(nn.Module):
-    def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
-                 tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False,
-                 r_r_bias=None, r_w_bias=None, output_attentions=False,
-                 layer_norm_epsilon=1e-5):
+    def __init__(
+        self,
+        n_head,
+        d_model,
+        d_head,
+        dropout,
+        dropatt=0,
+        tgt_len=None,
+        ext_len=None,
+        mem_len=None,
+        pre_lnorm=False,
+        r_r_bias=None,
+        r_w_bias=None,
+        output_attentions=False,
+        layer_norm_epsilon=1e-5,
+    ):
         super(RelPartialLearnableMultiHeadAttn, self).__init__()
 
         self.output_attentions = output_attentions
@@ -254,7 +264,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
 
         self.pre_lnorm = pre_lnorm
 
-        if r_r_bias is None or r_w_bias is None: # Biases are not shared
+        if r_r_bias is None or r_w_bias is None:  # Biases are not shared
             self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
             self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
         else:
@@ -299,18 +309,18 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
 
         klen = w_head_k.size(0)
 
-        w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head)           # qlen x bsz x n_head x d_head
-        w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head)           # qlen x bsz x n_head x d_head
-        w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head)           # qlen x bsz x n_head x d_head
+        w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head)  # qlen x bsz x n_head x d_head
+        w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head)  # qlen x bsz x n_head x d_head
+        w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head)  # qlen x bsz x n_head x d_head
 
-        r_head_k = r_head_k.view(rlen, self.n_head, self.d_head)                # qlen x n_head x d_head
+        r_head_k = r_head_k.view(rlen, self.n_head, self.d_head)  # qlen x n_head x d_head
 
         #### compute attention score
-        rw_head_q = w_head_q + self.r_w_bias                                    # qlen x bsz x n_head x d_head
-        AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k))             # qlen x klen x bsz x n_head
+        rw_head_q = w_head_q + self.r_w_bias  # qlen x bsz x n_head x d_head
+        AC = torch.einsum("ibnd,jbnd->ijbn", (rw_head_q, w_head_k))  # qlen x klen x bsz x n_head
 
         rr_head_q = w_head_q + self.r_r_bias
-        BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k))              # qlen x klen x bsz x n_head
+        BD = torch.einsum("ibnd,jnd->ijbn", (rr_head_q, r_head_k))  # qlen x klen x bsz x n_head
         BD = self._rel_shift(BD)
 
         # [qlen x klen x bsz x n_head]
@@ -319,21 +329,19 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
 
         #### compute attention probability
         if attn_mask is not None and torch.sum(attn_mask).item():
-            attn_mask = (attn_mask == 1)  # Switch to bool
+            attn_mask = attn_mask == 1  # Switch to bool
             if attn_mask.dim() == 2:
                 if next(self.parameters()).dtype == torch.float16:
-                    attn_score = attn_score.float().masked_fill(
-                        attn_mask[None,:,:,None], -65000).type_as(attn_score)
+                    attn_score = (
+                        attn_score.float().masked_fill(attn_mask[None, :, :, None], -65000).type_as(attn_score)
+                    )
                 else:
-                    attn_score = attn_score.float().masked_fill(
-                        attn_mask[None,:,:,None], -1e30).type_as(attn_score)
+                    attn_score = attn_score.float().masked_fill(attn_mask[None, :, :, None], -1e30).type_as(attn_score)
             elif attn_mask.dim() == 3:
                 if next(self.parameters()).dtype == torch.float16:
-                    attn_score = attn_score.float().masked_fill(
-                        attn_mask[:,:,:,None], -65000).type_as(attn_score)
+                    attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -65000).type_as(attn_score)
                 else:
-                    attn_score = attn_score.float().masked_fill(
-                        attn_mask[:,:,:,None], -1e30).type_as(attn_score)
+                    attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -1e30).type_as(attn_score)
 
         # [qlen x klen x bsz x n_head]
         attn_prob = F.softmax(attn_score, dim=1)
@@ -344,11 +352,10 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
             attn_prob = attn_prob * head_mask
 
         #### compute attention vector
-        attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v))
+        attn_vec = torch.einsum("ijbn,jbnd->ibnd", (attn_prob, w_head_v))
 
         # [qlen x bsz x n_head x d_head]
-        attn_vec = attn_vec.contiguous().view(
-            attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head)
+        attn_vec = attn_vec.contiguous().view(attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head)
 
         ##### linear projection
         attn_out = self.o_net(attn_vec)
@@ -368,21 +375,19 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
 
 
 class RelPartialLearnableDecoderLayer(nn.Module):
-    def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5,
-                 **kwargs):
+    def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs):
         super(RelPartialLearnableDecoderLayer, self).__init__()
 
-        self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model,
-                            d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs)
-        self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, 
-                                     pre_lnorm=kwargs.get('pre_lnorm'),
-                                     layer_norm_epsilon=layer_norm_epsilon)
+        self.dec_attn = RelPartialLearnableMultiHeadAttn(
+            n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs
+        )
+        self.pos_ff = PositionwiseFF(
+            d_model, d_inner, dropout, pre_lnorm=kwargs.get("pre_lnorm"), layer_norm_epsilon=layer_norm_epsilon
+        )
 
     def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None):
 
-        attn_outputs = self.dec_attn(dec_inp, r,
-                               attn_mask=dec_attn_mask,
-                               mems=mems, head_mask=head_mask)
+        attn_outputs = self.dec_attn(dec_inp, r, attn_mask=dec_attn_mask, mems=mems, head_mask=head_mask)
         ff_output = self.pos_ff(attn_outputs[0])
 
         outputs = [ff_output] + attn_outputs[1:]
@@ -391,8 +396,7 @@ class RelPartialLearnableDecoderLayer(nn.Module):
 
 
 class AdaptiveEmbedding(nn.Module):
-    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
-                 sample_softmax=False):
+    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False):
         super(AdaptiveEmbedding, self).__init__()
 
         self.n_token = n_token
@@ -409,28 +413,25 @@ class AdaptiveEmbedding(nn.Module):
         self.emb_layers = nn.ModuleList()
         self.emb_projs = nn.ParameterList()
         if div_val == 1:
-            self.emb_layers.append(
-                nn.Embedding(n_token, d_embed, sparse=sample_softmax>0)
-            )
+            self.emb_layers.append(nn.Embedding(n_token, d_embed, sparse=sample_softmax > 0))
             if d_proj != d_embed:
                 self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
         else:
             for i in range(len(self.cutoffs)):
-                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
+                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
                 d_emb_i = d_embed // (div_val ** i)
-                self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i))
+                self.emb_layers.append(nn.Embedding(r_idx - l_idx, d_emb_i))
                 self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))
 
     def forward(self, inp):
         if self.div_val == 1:
             embed = self.emb_layers[0](inp)
             if self.d_proj != self.d_embed:
-                embed  = F.linear(embed, self.emb_projs[0])
+                embed = F.linear(embed, self.emb_projs[0])
         else:
             param = next(self.parameters())
             inp_flat = inp.view(-1)
-            emb_flat = torch.zeros([inp_flat.size(0), self.d_proj],
-                dtype=param.dtype, device=param.device)
+            emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], dtype=param.dtype, device=param.device)
             for i in range(len(self.cutoffs)):
                 l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
 
@@ -458,15 +459,16 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = TransfoXLConfig
     pretrained_model_archive_map = TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_transfo_xl
     base_model_prefix = "transformer"
 
     def _init_weight(self, weight):
-        if self.config.init == 'uniform':
+        if self.config.init == "uniform":
             nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
-        elif self.config.init == 'normal':
+        elif self.config.init == "normal":
             nn.init.normal_(weight, 0.0, self.config.init_std)
 
     def _init_bias(self, bias):
@@ -476,41 +478,41 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
         """ Initialize the weights.
         """
         classname = m.__class__.__name__
-        if classname.find('Linear') != -1:
-            if hasattr(m, 'weight') and m.weight is not None:
+        if classname.find("Linear") != -1:
+            if hasattr(m, "weight") and m.weight is not None:
                 self._init_weight(m.weight)
-            if hasattr(m, 'bias') and m.bias is not None:
+            if hasattr(m, "bias") and m.bias is not None:
                 self._init_bias(m.bias)
-        elif classname.find('AdaptiveEmbedding') != -1:
-            if hasattr(m, 'emb_projs'):
+        elif classname.find("AdaptiveEmbedding") != -1:
+            if hasattr(m, "emb_projs"):
                 for i in range(len(m.emb_projs)):
                     if m.emb_projs[i] is not None:
                         nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std)
-        elif classname.find('Embedding') != -1:
-            if hasattr(m, 'weight'):
+        elif classname.find("Embedding") != -1:
+            if hasattr(m, "weight"):
                 self._init_weight(m.weight)
-        elif classname.find('ProjectedAdaptiveLogSoftmax') != -1:
-            if hasattr(m, 'cluster_weight') and m.cluster_weight is not None:
+        elif classname.find("ProjectedAdaptiveLogSoftmax") != -1:
+            if hasattr(m, "cluster_weight") and m.cluster_weight is not None:
                 self._init_weight(m.cluster_weight)
-            if hasattr(m, 'cluster_bias') and m.cluster_bias is not None:
+            if hasattr(m, "cluster_bias") and m.cluster_bias is not None:
                 self._init_bias(m.cluster_bias)
-            if hasattr(m, 'out_projs'):
+            if hasattr(m, "out_projs"):
                 for i in range(len(m.out_projs)):
                     if m.out_projs[i] is not None:
                         nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std)
-        elif classname.find('LayerNorm') != -1:
-            if hasattr(m, 'weight'):
+        elif classname.find("LayerNorm") != -1:
+            if hasattr(m, "weight"):
                 nn.init.normal_(m.weight, 1.0, self.config.init_std)
-            if hasattr(m, 'bias') and m.bias is not None:
+            if hasattr(m, "bias") and m.bias is not None:
                 self._init_bias(m.bias)
         else:
-            if hasattr(m, 'r_emb'):
+            if hasattr(m, "r_emb"):
                 self._init_weight(m.r_emb)
-            if hasattr(m, 'r_w_bias'):
+            if hasattr(m, "r_w_bias"):
                 self._init_weight(m.r_w_bias)
-            if hasattr(m, 'r_r_bias'):
+            if hasattr(m, "r_r_bias"):
                 self._init_weight(m.r_r_bias)
-            if hasattr(m, 'r_bias'):
+            if hasattr(m, "r_bias"):
                 self._init_bias(m.r_bias)
 
 
@@ -559,8 +561,12 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
-                      TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
+    TRANSFO_XL_START_DOCSTRING,
+    TRANSFO_XL_INPUTS_DOCSTRING,
+)
 class TransfoXLModel(TransfoXLPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -587,6 +593,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         last_hidden_states, mems = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(TransfoXLModel, self).__init__(config)
         self.output_attentions = config.output_attentions
@@ -599,8 +606,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         self.n_head = config.n_head
         self.d_head = config.d_head
 
-        self.word_emb = AdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs,
-                                          div_val=config.div_val)
+        self.word_emb = AdaptiveEmbedding(
+            config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val
+        )
 
         self.drop = nn.Dropout(config.dropout)
 
@@ -618,27 +626,35 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
             self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
 
         self.layers = nn.ModuleList()
-        if config.attn_type == 0: # the default attention
+        if config.attn_type == 0:  # the default attention
             for i in range(config.n_layer):
                 self.layers.append(
                     RelPartialLearnableDecoderLayer(
-                        config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,
-                        tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len,
-                        dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
+                        config.n_head,
+                        config.d_model,
+                        config.d_head,
+                        config.d_inner,
+                        config.dropout,
+                        tgt_len=config.tgt_len,
+                        ext_len=config.ext_len,
+                        mem_len=config.mem_len,
+                        dropatt=config.dropatt,
+                        pre_lnorm=config.pre_lnorm,
                         r_w_bias=None if config.untie_r else self.r_w_bias,
                         r_r_bias=None if config.untie_r else self.r_r_bias,
                         output_attentions=self.output_attentions,
-                        layer_norm_epsilon=config.layer_norm_epsilon)
+                        layer_norm_epsilon=config.layer_norm_epsilon,
+                    )
                 )
-        else: # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints
+        else:  # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints
             raise NotImplementedError  # Removed them to avoid maintaining dead code
 
         self.same_length = config.same_length
         self.clamp_len = config.clamp_len
 
-        if self.attn_type == 0: # default attention
+        if self.attn_type == 0:  # default attention
             self.pos_emb = PositionalEmbedding(self.d_model)
-        else: # learnable embeddings and absolute embeddings
+        else:  # learnable embeddings and absolute embeddings
             raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
 
         self.init_weights()
@@ -666,8 +682,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
             mems = []
             param = next(self.parameters())
             for i in range(self.n_layer):
-                empty = torch.zeros(self.mem_len, bsz, self.config.d_model,
-                                    dtype=param.dtype, device=param.device)
+                empty = torch.zeros(self.mem_len, bsz, self.config.d_model, dtype=param.dtype, device=param.device)
                 mems.append(empty)
 
             return mems
@@ -676,10 +691,11 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
     def _update_mems(self, hids, mems, qlen, mlen):
         # does not deal with None
-        if mems is None: return None
+        if mems is None:
+            return None
 
         # mems is not None
-        assert len(hids) == len(mems), 'len(hids) != len(mems)'
+        assert len(hids) == len(mems), "len(hids) != len(mems)"
 
         # There are `mlen + qlen` steps that can be cached into mems
         # For the next step, the last `ext_len` of the `qlen` tokens
@@ -725,7 +741,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
                 head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
                 head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to fload if need + fp16 compatibility
         else:
             head_mask = [None] * self.n_layer
 
@@ -743,17 +761,16 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
                 mask_shift_len = qlen - mask_len
             else:
                 mask_shift_len = qlen
-            dec_attn_mask = (torch.triu(all_ones, 1+mlen)
-                    + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1
+            dec_attn_mask = (torch.triu(all_ones, 1 + mlen) + torch.tril(all_ones, -mask_shift_len))[:, :, None]  # -1
         else:
-            dec_attn_mask = torch.triu(
-                word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None]
+            dec_attn_mask = torch.triu(word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1 + mlen)[
+                :, :, None
+            ]
 
         hids = []
         attentions = []
-        if self.attn_type == 0: # default
-            pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device,
-                                   dtype=word_emb.dtype)
+        if self.attn_type == 0:  # default
+            pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, dtype=word_emb.dtype)
             if self.clamp_len > 0:
                 pos_seq.clamp_(max=self.clamp_len)
             pos_emb = self.pos_emb(pos_seq)
@@ -764,12 +781,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
             for i, layer in enumerate(self.layers):
                 hids.append(core_out)
                 mems_i = None if mems is None else mems[i]
-                layer_outputs = layer(core_out, pos_emb, dec_attn_mask=dec_attn_mask,
-                                      mems=mems_i, head_mask=head_mask[i])
+                layer_outputs = layer(
+                    core_out, pos_emb, dec_attn_mask=dec_attn_mask, mems=mems_i, head_mask=head_mask[i]
+                )
                 core_out = layer_outputs[0]
                 if self.output_attentions:
                     attentions.append(layer_outputs[1])
-        else: # learnable embeddings and absolute embeddings
+        else:  # learnable embeddings and absolute embeddings
             raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
 
         core_out = self.drop(core_out)
@@ -791,9 +809,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)
 
 
-@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top
+@add_start_docstrings(
+    """The Transformer-XL Model with a language modeling head on top
     (adaptive softmax with weights tied to the adaptive input embeddings)""",
-    TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
+    TRANSFO_XL_START_DOCSTRING,
+    TRANSFO_XL_INPUTS_DOCSTRING,
+)
 class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -830,6 +851,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         prediction_scores, mems = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(TransfoXLLMHeadModel, self).__init__(config)
         self.transformer = TransfoXLModel(config)
@@ -840,8 +862,9 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
             self.sampler = LogUniformSampler(config.vocab_size, config.sample_softmax)
         # use adaptive softmax (including standard softmax)
         else:
-            self.crit = ProjectedAdaptiveLogSoftmax(config.vocab_size, config.d_embed, config.d_model,
-                                                    config.cutoffs, div_val=config.div_val)
+            self.crit = ProjectedAdaptiveLogSoftmax(
+                config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val
+            )
         self.init_weights()
 
     def tie_weights(self):
@@ -856,8 +879,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         else:
             if self.config.tie_weight:
                 for i in range(len(self.crit.out_layers)):
-                    self._tie_or_clone_weights(self.crit.out_layers[i],
-                                               self.transformer.word_emb.emb_layers[i])
+                    self._tie_or_clone_weights(self.crit.out_layers[i], self.transformer.word_emb.emb_layers[i])
             if self.config.tie_projs:
                 for i, tie_proj in enumerate(self.config.tie_projs):
                     if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
diff --git a/transformers/modeling_transfo_xl_utilities.py b/transformers/modeling_transfo_xl_utilities.py
index 0773d0d5fc..89451bb558 100644
--- a/transformers/modeling_transfo_xl_utilities.py
+++ b/transformers/modeling_transfo_xl_utilities.py
@@ -28,9 +28,9 @@ import torch.nn.functional as F
 # CUDA_MAJOR = int(torch.version.cuda.split('.')[0])
 # CUDA_MINOR = int(torch.version.cuda.split('.')[1])
 
+
 class ProjectedAdaptiveLogSoftmax(nn.Module):
-    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
-                 keep_order=False):
+    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False):
         super(ProjectedAdaptiveLogSoftmax, self).__init__()
 
         self.n_token = n_token
@@ -55,23 +55,19 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
         if div_val == 1:
             for i in range(len(self.cutoffs)):
                 if d_proj != d_embed:
-                    self.out_projs.append(
-                        nn.Parameter(torch.FloatTensor(d_proj, d_embed))
-                    )
+                    self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
                 else:
                     self.out_projs.append(None)
 
             self.out_layers.append(nn.Linear(d_embed, n_token))
         else:
             for i in range(len(self.cutoffs)):
-                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
+                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
                 d_emb_i = d_embed // (div_val ** i)
 
-                self.out_projs.append(
-                    nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))
-                )
+                self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))
 
-                self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx))
+                self.out_layers.append(nn.Linear(d_emb_i, r_idx - l_idx))
 
         self.keep_order = keep_order
 
@@ -90,7 +86,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
         return logit
 
     def forward(self, hidden, labels=None, keep_order=False):
-        '''
+        """
             Params:
                 hidden :: [len*bsz x d_proj]
                 labels :: [len*bsz]
@@ -102,20 +98,17 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
             We could replace this implementation by the native PyTorch one
             if their's had an option to set bias on all clusters in the native one.
             here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
-        '''
+        """
 
         if labels is not None:
             labels = labels.view(-1)
             if hidden.size(0) != labels.size(0):
-                raise RuntimeError('Input and labels should have the same size '
-                                'in the batch dimension.')
+                raise RuntimeError("Input and labels should have the same size " "in the batch dimension.")
 
         if self.n_clusters == 0:
-            logit = self._compute_logit(hidden, self.out_layers[0].weight,
-                                        self.out_layers[0].bias, self.out_projs[0])
+            logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
             if labels is not None:
-                out = -F.log_softmax(logit, dim=-1) \
-                        .gather(1, labels.unsqueeze(1)).squeeze(1)
+                out = -F.log_softmax(logit, dim=-1).gather(1, labels.unsqueeze(1)).squeeze(1)
             else:
                 out = F.log_softmax(logit, dim=-1)
         else:
@@ -131,10 +124,8 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
                     bias_i = self.out_layers[i].bias
 
                 if i == 0:
-                    weight_i = torch.cat(
-                        [weight_i, self.cluster_weight], dim=0)
-                    bias_i = torch.cat(
-                        [bias_i, self.cluster_bias], dim=0)
+                    weight_i = torch.cat([weight_i, self.cluster_weight], dim=0)
+                    bias_i = torch.cat([bias_i, self.cluster_bias], dim=0)
 
                 weights.append(weight_i)
                 biases.append(bias_i)
@@ -171,7 +162,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
                     if labels is not None:
                         logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1)
                     else:
-                        out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
+                        out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]]
                 else:
                     weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
 
@@ -179,22 +170,22 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
                     tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
                     cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster
                     if labels is not None:
-                        logprob_i = head_logprob_i[:, cluster_prob_idx] \
-                                + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1)
+                        logprob_i = head_logprob_i[:, cluster_prob_idx] + tail_logprob_i.gather(
+                            1, target_i[:, None]
+                        ).squeeze(1)
                     else:
                         logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i
                         out[:, l_idx:r_idx] = logprob_i
 
                 if labels is not None:
-                    if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
+                    if (hasattr(self, "keep_order") and self.keep_order) or keep_order:
                         out.index_copy_(0, indices_i, -logprob_i)
                     else:
-                        out[offset:offset+logprob_i.size(0)].copy_(-logprob_i)
+                        out[offset : offset + logprob_i.size(0)].copy_(-logprob_i)
                     offset += logprob_i.size(0)
 
         return out
 
-
     def log_prob(self, hidden):
         r""" Computes log probabilities for all :math:`n\_classes`
         From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
@@ -209,8 +200,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
             - Output: :math:`(N, n\_classes)`
         """
         if self.n_clusters == 0:
-            logit = self._compute_logit(hidden, self.out_layers[0].weight,
-                                        self.out_layers[0].bias, self.out_projs[0])
+            logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
             return F.log_softmax(logit, dim=-1)
         else:
             # construct weights and biases
@@ -225,10 +215,8 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
                     bias_i = self.out_layers[i].bias
 
                 if i == 0:
-                    weight_i = torch.cat(
-                        [weight_i, self.cluster_weight], dim=0)
-                    bias_i = torch.cat(
-                        [bias_i, self.cluster_bias], dim=0)
+                    weight_i = torch.cat([weight_i, self.cluster_weight], dim=0)
+                    bias_i = torch.cat([bias_i, self.cluster_bias], dim=0)
 
                 weights.append(weight_i)
                 biases.append(bias_i)
@@ -244,7 +232,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
                 start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1]
 
                 if i == 0:
-                    out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
+                    out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]]
                 else:
                     weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
 
@@ -270,10 +258,10 @@ class LogUniformSampler(object):
         """
         with torch.no_grad():
             self.range_max = range_max
-            log_indices = torch.arange(1., range_max+2., 1.).log_()
+            log_indices = torch.arange(1.0, range_max + 2.0, 1.0).log_()
             self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
 
-            self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
+            self.log_q = (-(-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
 
         self.n_sample = n_sample
 
@@ -298,6 +286,7 @@ class LogUniformSampler(object):
             samp_log_probs = self.log_q[neg_samples].to(device)
             return true_log_probs, samp_log_probs, neg_samples
 
+
 def sample_logits(embedding, bias, labels, inputs, sampler):
     """
         embedding: an nn.Embedding layer
@@ -313,19 +302,17 @@ def sample_logits(embedding, bias, labels, inputs, sampler):
     b1, b2 = labels.size(0), labels.size(1)
     all_ids = torch.cat([labels.view(-1), neg_samples])
     all_w = embedding(all_ids)
-    true_w = all_w[: -n_sample].view(b1, b2, -1)
-    sample_w = all_w[- n_sample:].view(n_sample, -1)
+    true_w = all_w[:-n_sample].view(b1, b2, -1)
+    sample_w = all_w[-n_sample:].view(n_sample, -1)
 
     all_b = bias[all_ids]
-    true_b = all_b[: -n_sample].view(b1, b2)
-    sample_b = all_b[- n_sample:]
+    true_b = all_b[:-n_sample].view(b1, b2)
+    sample_b = all_b[-n_sample:]
 
     hit = (labels[:, :, None] == neg_samples).detach()
 
-    true_logits = torch.einsum('ijk,ijk->ij',
-        [true_w, inputs]) + true_b - true_log_probs
-    sample_logits = torch.einsum('lk,ijk->ijl',
-        [sample_w, inputs]) + sample_b - samp_log_probs
+    true_logits = torch.einsum("ijk,ijk->ij", [true_w, inputs]) + true_b - true_log_probs
+    sample_logits = torch.einsum("lk,ijk->ijl", [sample_w, inputs]) + sample_b - samp_log_probs
     sample_logits.masked_fill_(hit, -1e30)
     logits = torch.cat([true_logits[:, :, None], sample_logits], -1)
 
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index 05e5ed3573..e934b90528 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """PyTorch BERT model."""
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import copy
 import json
@@ -31,8 +30,15 @@ from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F
 
 from .configuration_utils import PretrainedConfig
-from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS,
-                         cached_path, hf_bucket_url, is_remote_url)
+from .file_utils import (
+    TF2_WEIGHTS_NAME,
+    TF_WEIGHTS_NAME,
+    WEIGHTS_NAME,
+    DUMMY_INPUTS,
+    cached_path,
+    hf_bucket_url,
+    is_remote_url,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -43,12 +49,14 @@ except ImportError:
     class Identity(nn.Module):
         r"""A placeholder identity operator that is argument-insensitive.
         """
+
         def __init__(self, *args, **kwargs):
             super(Identity, self).__init__()
 
         def forward(self, input):
             return input
 
+
 class PreTrainedModel(nn.Module):
     r""" Base class for all models.
 
@@ -78,7 +86,7 @@ class PreTrainedModel(nn.Module):
         Returns:
             torch.Tensor with dummy inputs
         """
-        return {'input_ids': torch.tensor(DUMMY_INPUTS)}
+        return {"input_ids": torch.tensor(DUMMY_INPUTS)}
 
     def __init__(self, config, *inputs, **kwargs):
         super(PreTrainedModel, self).__init__()
@@ -88,7 +96,8 @@ class PreTrainedModel(nn.Module):
                 "To create a model from a pretrained model use "
                 "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                     self.__class__.__name__, self.__class__.__name__
-                ))
+                )
+            )
         # Save config in model
         self.config = config
 
@@ -136,14 +145,14 @@ class PreTrainedModel(nn.Module):
         else:
             output_embeddings.weight = input_embeddings.weight
 
-        if hasattr(output_embeddings, 'bias') and output_embeddings.bias is not None:
+        if hasattr(output_embeddings, "bias") and output_embeddings.bias is not None:
             output_embeddings.bias.data = torch.nn.functional.pad(
                 output_embeddings.bias.data,
                 (0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0]),
-                'constant',
-                0
+                "constant",
+                0,
             )
-        if hasattr(output_embeddings, 'out_features') and hasattr(input_embeddings, 'num_embeddings'):
+        if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
             output_embeddings.out_features = input_embeddings.num_embeddings
 
     def resize_token_embeddings(self, new_num_tokens=None):
@@ -244,10 +253,12 @@ class PreTrainedModel(nn.Module):
         """ Save a model and its configuration file to a directory, so that it
             can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.
         """
-        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
+        assert os.path.isdir(
+            save_directory
+        ), "Saving path should be a directory where the model and configuration can be saved"
 
         # Only save the model itself if we are using distributed training
-        model_to_save = self.module if hasattr(self, 'module') else self
+        model_to_save = self.module if hasattr(self, "module") else self
 
         # Save configuration file
         model_to_save.config.save_pretrained(save_directory)
@@ -329,21 +340,23 @@ class PreTrainedModel(nn.Module):
             model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        config = kwargs.pop('config', None)
-        state_dict = kwargs.pop('state_dict', None)
-        cache_dir = kwargs.pop('cache_dir', None)
-        from_tf = kwargs.pop('from_tf', False)
-        force_download = kwargs.pop('force_download', False)
-        resume_download = kwargs.pop('resume_download', False)
-        proxies = kwargs.pop('proxies', None)
-        output_loading_info = kwargs.pop('output_loading_info', False)
+        config = kwargs.pop("config", None)
+        state_dict = kwargs.pop("state_dict", None)
+        cache_dir = kwargs.pop("cache_dir", None)
+        from_tf = kwargs.pop("from_tf", False)
+        force_download = kwargs.pop("force_download", False)
+        resume_download = kwargs.pop("resume_download", False)
+        proxies = kwargs.pop("proxies", None)
+        output_loading_info = kwargs.pop("output_loading_info", False)
 
         # Load config if we don't provide a configuration
         if not isinstance(config, PretrainedConfig):
             config_path = config if config is not None else pretrained_model_name_or_path
             config, model_kwargs = cls.config_class.from_pretrained(
-                config_path, *model_args,
-                cache_dir=cache_dir, return_unused_kwargs=True,
+                config_path,
+                *model_args,
+                cache_dir=cache_dir,
+                return_unused_kwargs=True,
                 force_download=force_download,
                 resume_download=resume_download,
                 proxies=proxies,
@@ -367,43 +380,56 @@ class PreTrainedModel(nn.Module):
                     # Load from a PyTorch checkpoint
                     archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
                 else:
-                    raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format(
-                        [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
-                        pretrained_model_name_or_path))
+                    raise EnvironmentError(
+                        "Error no file named {} found in directory {} or `from_tf` set to False".format(
+                            [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], pretrained_model_name_or_path
+                        )
+                    )
             elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
             elif os.path.isfile(pretrained_model_name_or_path + ".index"):
-                assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
-                    pretrained_model_name_or_path + ".index")
+                assert (
+                    from_tf
+                ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
+                    pretrained_model_name_or_path + ".index"
+                )
                 archive_file = pretrained_model_name_or_path + ".index"
             else:
                 archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME)
                 if from_tf:
-                    raise EnvironmentError("Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name.")
+                    raise EnvironmentError(
+                        "Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name."
+                    )
 
             # redirect to the cache, if necessary
             try:
-                resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download,
-                                                    proxies=proxies, resume_download=resume_download)
+                resolved_archive_file = cached_path(
+                    archive_file,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                )
             except EnvironmentError:
                 if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
-                    msg = "Couldn't reach server at '{}' to download pretrained weights.".format(
-                            archive_file)
+                    msg = "Couldn't reach server at '{}' to download pretrained weights.".format(archive_file)
                 else:
-                    msg = "Model name '{}' was not found in model name list ({}). " \
-                        "We assumed '{}' was a path or url to model weight files named one of {} but " \
+                    msg = (
+                        "Model name '{}' was not found in model name list ({}). "
+                        "We assumed '{}' was a path or url to model weight files named one of {} but "
                         "couldn't find any such file at this path or url.".format(
                             pretrained_model_name_or_path,
-                            ', '.join(cls.pretrained_model_archive_map.keys()),
+                            ", ".join(cls.pretrained_model_archive_map.keys()),
                             archive_file,
-                            [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME])
+                            [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME],
+                        )
+                    )
                 raise EnvironmentError(msg)
 
             if resolved_archive_file == archive_file:
                 logger.info("loading weights file {}".format(archive_file))
             else:
-                logger.info("loading weights file {} from cache at {}".format(
-                    archive_file, resolved_archive_file))
+                logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file))
         else:
             resolved_archive_file = None
 
@@ -412,27 +438,32 @@ class PreTrainedModel(nn.Module):
 
         if state_dict is None and not from_tf:
             try:
-                state_dict = torch.load(resolved_archive_file, map_location='cpu')
+                state_dict = torch.load(resolved_archive_file, map_location="cpu")
             except:
-                raise OSError("Unable to load weights from pytorch checkpoint file. "
-                              "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ")
+                raise OSError(
+                    "Unable to load weights from pytorch checkpoint file. "
+                    "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. "
+                )
 
         missing_keys = []
         unexpected_keys = []
         error_msgs = []
 
         if from_tf:
-            if resolved_archive_file.endswith('.index'):
+            if resolved_archive_file.endswith(".index"):
                 # Load from a TensorFlow 1.X checkpoint - provided by original authors
                 model = cls.load_tf_weights(model, config, resolved_archive_file[:-6])  # Remove the '.index'
             else:
                 # Load from our TensorFlow 2.0 checkpoints
                 try:
                     from transformers import load_tf2_checkpoint_in_pytorch_model
+
                     model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True)
                 except ImportError as e:
-                    logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
-                        "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
+                    logger.error(
+                        "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
+                        "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
+                    )
                     raise e
         else:
             # Convert old format to new format if needed from a PyTorch state_dict
@@ -440,10 +471,10 @@ class PreTrainedModel(nn.Module):
             new_keys = []
             for key in state_dict.keys():
                 new_key = None
-                if 'gamma' in key:
-                    new_key = key.replace('gamma', 'weight')
-                if 'beta' in key:
-                    new_key = key.replace('beta', 'bias')
+                if "gamma" in key:
+                    new_key = key.replace("gamma", "weight")
+                if "beta" in key:
+                    new_key = key.replace("beta", "bias")
                 if new_key:
                     old_keys.append(key)
                     new_keys.append(new_key)
@@ -451,39 +482,53 @@ class PreTrainedModel(nn.Module):
                 state_dict[new_key] = state_dict.pop(old_key)
 
             # copy state_dict so _load_from_state_dict can modify it
-            metadata = getattr(state_dict, '_metadata', None)
+            metadata = getattr(state_dict, "_metadata", None)
             state_dict = state_dict.copy()
             if metadata is not None:
                 state_dict._metadata = metadata
 
             # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
             # so we need to apply the function recursively.
-            def load(module, prefix=''):
+            def load(module, prefix=""):
                 local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
                 module._load_from_state_dict(
-                    state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+                    state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
+                )
                 for name, child in module._modules.items():
                     if child is not None:
-                        load(child, prefix + name + '.')
+                        load(child, prefix + name + ".")
 
             # Make sure we are able to load base models as well as derived models (with heads)
-            start_prefix = ''
+            start_prefix = ""
             model_to_load = model
-            if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
-                start_prefix = cls.base_model_prefix + '.'
-            if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
+            if not hasattr(model, cls.base_model_prefix) and any(
+                s.startswith(cls.base_model_prefix) for s in state_dict.keys()
+            ):
+                start_prefix = cls.base_model_prefix + "."
+            if hasattr(model, cls.base_model_prefix) and not any(
+                s.startswith(cls.base_model_prefix) for s in state_dict.keys()
+            ):
                 model_to_load = getattr(model, cls.base_model_prefix)
 
             load(model_to_load, prefix=start_prefix)
             if len(missing_keys) > 0:
-                logger.info("Weights of {} not initialized from pretrained model: {}".format(
-                    model.__class__.__name__, missing_keys))
+                logger.info(
+                    "Weights of {} not initialized from pretrained model: {}".format(
+                        model.__class__.__name__, missing_keys
+                    )
+                )
             if len(unexpected_keys) > 0:
-                logger.info("Weights from pretrained model not used in {}: {}".format(
-                    model.__class__.__name__, unexpected_keys))
+                logger.info(
+                    "Weights from pretrained model not used in {}: {}".format(
+                        model.__class__.__name__, unexpected_keys
+                    )
+                )
             if len(error_msgs) > 0:
-                raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-                                model.__class__.__name__, "\n\t".join(error_msgs)))
+                raise RuntimeError(
+                    "Error(s) in loading state_dict for {}:\n\t{}".format(
+                        model.__class__.__name__, "\n\t".join(error_msgs)
+                    )
+                )
 
         model.tie_weights()  # make sure word embedding weights are still tied if needed
 
@@ -500,10 +545,22 @@ class PreTrainedModel(nn.Module):
         return {"input_ids": input_ids}
 
     @torch.no_grad()
-    def generate(self, input_ids=None, max_length=None, do_sample=None, num_beams=None,
-                 temperature=None, top_k=None, top_p=None, repetition_penalty=None,
-                 bos_token_id=None, pad_token_id=None, eos_token_ids=None,
-                 length_penalty=None, num_return_sequences=None):
+    def generate(
+        self,
+        input_ids=None,
+        max_length=None,
+        do_sample=None,
+        num_beams=None,
+        temperature=None,
+        top_k=None,
+        top_p=None,
+        repetition_penalty=None,
+        bos_token_id=None,
+        pad_token_id=None,
+        eos_token_ids=None,
+        length_penalty=None,
+        num_return_sequences=None,
+    ):
         """ Sequence generator for models with a LM head.
 
         The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling
@@ -543,8 +600,10 @@ class PreTrainedModel(nn.Module):
 
         # We cannot generate if the model does not have a LM head
         if self.get_output_embeddings() is None:
-            raise AttributeError("You tried to generate sequences with a model that does not have a LM Head."
-                                 "Please use another model class (e.g. `OpenAIGPTLMHeadModel`)")
+            raise AttributeError(
+                "You tried to generate sequences with a model that does not have a LM Head."
+                "Please use another model class (e.g. `OpenAIGPTLMHeadModel`)"
+            )
 
         max_length = max_length if max_length is not None else self.config.max_length
         do_sample = do_sample if do_sample is not None else self.config.do_sample
@@ -557,7 +616,9 @@ class PreTrainedModel(nn.Module):
         pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
         eos_token_ids = eos_token_ids if eos_token_ids is not None else self.config.eos_token_ids
         length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty
-        num_return_sequences = num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
+        num_return_sequences = (
+            num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
+        )
 
         if input_ids is not None:
             batch_size = input_ids.shape[0]  # overriden by the input batch_size
@@ -575,13 +636,18 @@ class PreTrainedModel(nn.Module):
         assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
         assert isinstance(bos_token_id, int) and bos_token_id >= 0, "`bos_token_id` should be a positive integer."
         assert isinstance(pad_token_id, int) and pad_token_id >= 0, "`pad_token_id` should be a positive integer."
-        assert isinstance(eos_token_ids, (list, tuple)) and (e >= 0 for e in eos_token_ids), \
-                   "`eos_token_ids` should be a positive integer or a list/tuple of positive integers."
+        assert isinstance(eos_token_ids, (list, tuple)) and (
+            e >= 0 for e in eos_token_ids
+        ), "`eos_token_ids` should be a positive integer or a list/tuple of positive integers."
         assert length_penalty > 0, "`length_penalty` should be strictely positive."
-        assert isinstance(num_return_sequences, int) and num_return_sequences > 0, "`num_return_sequences` should be a strictely positive integer."
+        assert (
+            isinstance(num_return_sequences, int) and num_return_sequences > 0
+        ), "`num_return_sequences` should be a strictely positive integer."
 
         if input_ids is None:
-            input_ids = torch.full((batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device)
+            input_ids = torch.full(
+                (batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device
+            )
         else:
             assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)."
 
@@ -592,28 +658,63 @@ class PreTrainedModel(nn.Module):
         if num_return_sequences != 1:
             # Expand input to num return sequences
             input_ids = input_ids.unsqueeze(1).expand(batch_size, num_return_sequences, cur_len)
-            input_ids = input_ids.contiguous().view(batch_size * num_return_sequences, cur_len)   # (batch_size * num_return_sequences, cur_len)
+            input_ids = input_ids.contiguous().view(
+                batch_size * num_return_sequences, cur_len
+            )  # (batch_size * num_return_sequences, cur_len)
             effective_batch_size = batch_size * num_return_sequences
         else:
             effective_batch_size = batch_size
 
         if num_beams > 1:
-            output = self._generate_beam_search(input_ids, cur_len, max_length, do_sample,
-                                                temperature, top_k, top_p, repetition_penalty,
-                                                pad_token_id, eos_token_ids, effective_batch_size,
-                                                length_penalty, num_beams, vocab_size)
+            output = self._generate_beam_search(
+                input_ids,
+                cur_len,
+                max_length,
+                do_sample,
+                temperature,
+                top_k,
+                top_p,
+                repetition_penalty,
+                pad_token_id,
+                eos_token_ids,
+                effective_batch_size,
+                length_penalty,
+                num_beams,
+                vocab_size,
+            )
         else:
-            output = self._generate_no_beam_search(input_ids, cur_len, max_length, do_sample,
-                                             temperature, top_k, top_p, repetition_penalty,
-                                             pad_token_id, eos_token_ids, effective_batch_size)
+            output = self._generate_no_beam_search(
+                input_ids,
+                cur_len,
+                max_length,
+                do_sample,
+                temperature,
+                top_k,
+                top_p,
+                repetition_penalty,
+                pad_token_id,
+                eos_token_ids,
+                effective_batch_size,
+            )
 
         if num_return_sequences != 1:
             output = output.view(batch_size, num_return_sequences, -1)
         return output
 
-    def _generate_no_beam_search(self, input_ids, cur_len, max_length, do_sample,
-                                 temperature, top_k, top_p, repetition_penalty,
-                                 pad_token_id, eos_token_ids, batch_size):
+    def _generate_no_beam_search(
+        self,
+        input_ids,
+        cur_len,
+        max_length,
+        do_sample,
+        temperature,
+        top_k,
+        top_p,
+        repetition_penalty,
+        pad_token_id,
+        eos_token_ids,
+        batch_size,
+    ):
         """ Generate sequences for each example without beam search (num_beams == 1).
             All returned sequence are generated independantly.
         """
@@ -663,23 +764,38 @@ class PreTrainedModel(nn.Module):
 
         return input_ids
 
-    def _generate_beam_search(self, input_ids, cur_len, max_length, do_sample,
-                              temperature, top_k, top_p, repetition_penalty,
-                              pad_token_id, eos_token_ids, batch_size,
-                              length_penalty, num_beams, vocab_size):
+    def _generate_beam_search(
+        self,
+        input_ids,
+        cur_len,
+        max_length,
+        do_sample,
+        temperature,
+        top_k,
+        top_p,
+        repetition_penalty,
+        pad_token_id,
+        eos_token_ids,
+        batch_size,
+        length_penalty,
+        num_beams,
+        vocab_size,
+    ):
         """ Generate sequences for each example with beam search.
         """
         # Expand input to num beams
         input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len)
-        input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len)   # (batch_size * num_beams, cur_len)
+        input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len)  # (batch_size * num_beams, cur_len)
 
         # generated hypotheses
-        generated_hyps = [BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=False) for _ in range(batch_size)]
+        generated_hyps = [
+            BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=False) for _ in range(batch_size)
+        ]
 
         # scores for each sentence in the beam
         beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
         beam_scores[:, 1:] = -1e9
-        beam_scores = beam_scores.view(-1)                                      # shape (batch_size * num_beams,)
+        beam_scores = beam_scores.view(-1)  # shape (batch_size * num_beams,)
 
         # cache compute states
         pasts = None  # self.prepare_pasts()
@@ -689,8 +805,8 @@ class PreTrainedModel(nn.Module):
 
         while cur_len < max_length:
             model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts)
-            scores = self(**model_inputs)[0]                                    # (batch_size * num_beams, cur_len, vocab_size)
-            scores = scores[:, -1, :]                                           # (batch_size * num_beams, vocab_size)
+            scores = self(**model_inputs)[0]  # (batch_size * num_beams, cur_len, vocab_size)
+            scores = scores[:, -1, :]  # (batch_size * num_beams, vocab_size)
 
             # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
             if repetition_penalty != 1.0:
@@ -703,25 +819,27 @@ class PreTrainedModel(nn.Module):
                 if temperature > 0 and temperature != 1.0:
                     scores = scores / temperature
                 # Top-p/top-k filtering
-                scores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2)  # (batch_size * num_beams, vocab_size)
+                scores = top_k_top_p_filtering(
+                    scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2
+                )  # (batch_size * num_beams, vocab_size)
                 # Sample 2 next words for each beam (so we have some spare tokens and match output of greedy beam search)
-                next_words = torch.multinomial(F.softmax(scores, dim=-1), num_samples=2)    # (batch_size * num_beams, 2)
+                next_words = torch.multinomial(F.softmax(scores, dim=-1), num_samples=2)  # (batch_size * num_beams, 2)
                 # Compute next scores
-                _scores = F.log_softmax(scores, dim=-1)                                     # (batch_size * num_beams, vocab_size)
-                _scores = torch.gather(_scores, -1, next_words)                             # (batch_size * num_beams, 2)
-                next_scores = _scores + beam_scores[:, None].expand_as(_scores)             # (batch_size * num_beams, 2)
+                _scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
+                _scores = torch.gather(_scores, -1, next_words)  # (batch_size * num_beams, 2)
+                next_scores = _scores + beam_scores[:, None].expand_as(_scores)  # (batch_size * num_beams, 2)
                 # Match shape of greedy beam search
-                next_words = next_words.view(batch_size, 2 * num_beams)                     # (batch_size, 2 * num_beams)
-                next_scores = next_scores.view(batch_size, 2 * num_beams)                   # (batch_size, 2 * num_beams)
+                next_words = next_words.view(batch_size, 2 * num_beams)  # (batch_size, 2 * num_beams)
+                next_scores = next_scores.view(batch_size, 2 * num_beams)  # (batch_size, 2 * num_beams)
             else:
                 # do greedy beam search
-                scores = F.log_softmax(scores, dim=-1)                          # (batch_size * num_beams, vocab_size)
+                scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
                 assert scores.size() == (batch_size * num_beams, vocab_size)
                 # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product)
-                _scores = scores + beam_scores[:, None].expand_as(scores)       # (batch_size * num_beams, vocab_size)
+                _scores = scores + beam_scores[:, None].expand_as(scores)  # (batch_size * num_beams, vocab_size)
                 # re-organize to group the beam together (we are keeping top hypothesis accross beams)
-                _scores = _scores.view(batch_size, num_beams * vocab_size)      # (batch_size, num_beams * vocab_size)
-                next_scores, next_words = torch.topk(_scores, 2*num_beams, dim=1, largest=True, sorted=True)
+                _scores = _scores.view(batch_size, num_beams * vocab_size)  # (batch_size, num_beams * vocab_size)
+                next_scores, next_words = torch.topk(_scores, 2 * num_beams, dim=1, largest=True, sorted=True)
 
             assert next_scores.size() == next_words.size() == (batch_size, 2 * num_beams)
 
@@ -750,7 +868,9 @@ class PreTrainedModel(nn.Module):
 
                     # end of sentence, or next word
                     if word_id.item() in eos_token_ids or cur_len + 1 == max_length:
-                        generated_hyps[batch_ex].add(input_ids[batch_ex * num_beams + beam_id, :cur_len].clone(), score.item())
+                        generated_hyps[batch_ex].add(
+                            input_ids[batch_ex * num_beams + beam_id, :cur_len].clone(), score.item()
+                        )
                     else:
                         next_sent_beam.append((score, word_id, batch_ex * num_beams + beam_id))
 
@@ -807,13 +927,13 @@ class PreTrainedModel(nn.Module):
         # generate target batch
         decoded = input_ids.new(batch_size, tgt_len.max().item()).fill_(pad_token_id)
         for i, hypo in enumerate(best):
-            decoded[i, :tgt_len[i] - 1] = hypo
+            decoded[i, : tgt_len[i] - 1] = hypo
             decoded[i, tgt_len[i] - 1] = eos_token_ids[0]
 
         return decoded
 
 
-def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float('Inf'), min_tokens_to_keep=1):
+def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
     """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
         Args:
             logits: logits distribution shape (batch size, vocabulary size)
@@ -849,7 +969,6 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float('Inf')
 
 
 class BeamHypotheses(object):
-
     def __init__(self, n_hyp, max_length, length_penalty, early_stopping):
         """
         Initialize n-best list of hypotheses.
@@ -915,6 +1034,7 @@ class Conv1D(nn.Module):
 
 class PoolerStartLogits(nn.Module):
     """ Compute SQuAD start_logits from sequence hidden states. """
+
     def __init__(self, config):
         super(PoolerStartLogits, self).__init__()
         self.dense = nn.Linear(config.hidden_size, 1)
@@ -939,6 +1059,7 @@ class PoolerStartLogits(nn.Module):
 class PoolerEndLogits(nn.Module):
     """ Compute SQuAD end_logits from sequence hidden states and start token hidden state.
     """
+
     def __init__(self, config):
         super(PoolerEndLogits, self).__init__()
         self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
@@ -959,12 +1080,14 @@ class PoolerEndLogits(nn.Module):
                 Mask of invalid position such as query and special symbols (PAD, SEP, CLS)
                 1.0 means token should be masked.
         """
-        assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None"
+        assert (
+            start_states is not None or start_positions is not None
+        ), "One of start_states, start_positions should be not None"
         if start_positions is not None:
             slen, hsz = hidden_states.shape[-2:]
-            start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
-            start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz)
-            start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz)
+            start_positions = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
+            start_states = hidden_states.gather(-2, start_positions)  # shape (bsz, 1, hsz)
+            start_states = start_states.expand(-1, slen, -1)  # shape (bsz, slen, hsz)
 
         x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1))
         x = self.activation(x)
@@ -982,6 +1105,7 @@ class PoolerEndLogits(nn.Module):
 
 class PoolerAnswerClass(nn.Module):
     """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """
+
     def __init__(self, config):
         super(PoolerAnswerClass, self).__init__()
         self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
@@ -1006,16 +1130,18 @@ class PoolerAnswerClass(nn.Module):
                 for each sample
         """
         hsz = hidden_states.shape[-1]
-        assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None"
+        assert (
+            start_states is not None or start_positions is not None
+        ), "One of start_states, start_positions should be not None"
         if start_positions is not None:
-            start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
-            start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz)
+            start_positions = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
+            start_states = hidden_states.gather(-2, start_positions).squeeze(-2)  # shape (bsz, hsz)
 
         if cls_index is not None:
-            cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
-            cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz)
+            cls_index = cls_index[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
+            cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, hsz)
         else:
-            cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz)
+            cls_token_state = hidden_states[:, -1, :]  # shape (bsz, hsz)
 
         x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1))
         x = self.activation(x)
@@ -1064,6 +1190,7 @@ class SQuADHead(nn.Module):
             ``torch.FloatTensor`` of shape ``(batch_size,)``
             Log probabilities for the ``is_impossible`` label of the answers.
     """
+
     def __init__(self, config):
         super(SQuADHead, self).__init__()
         self.start_n_top = config.start_n_top
@@ -1073,8 +1200,9 @@ class SQuADHead(nn.Module):
         self.end_logits = PoolerEndLogits(config)
         self.answer_class = PoolerAnswerClass(config)
 
-    def forward(self, hidden_states, start_positions=None, end_positions=None,
-                cls_index=None, is_impossible=None, p_mask=None):
+    def forward(
+        self, hidden_states, start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None
+    ):
         outputs = ()
 
         start_logits = self.start_logits(hidden_states, p_mask=p_mask)
@@ -1107,19 +1235,25 @@ class SQuADHead(nn.Module):
         else:
             # during inference, compute the end logits based on beam search
             bsz, slen, hsz = hidden_states.size()
-            start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen)
+            start_log_probs = F.softmax(start_logits, dim=-1)  # shape (bsz, slen)
 
-            start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top)
-            start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
-            start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz)
-            start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
+            start_top_log_probs, start_top_index = torch.topk(
+                start_log_probs, self.start_n_top, dim=-1
+            )  # shape (bsz, start_n_top)
+            start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz)  # shape (bsz, start_n_top, hsz)
+            start_states = torch.gather(hidden_states, -2, start_top_index_exp)  # shape (bsz, start_n_top, hsz)
+            start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1)  # shape (bsz, slen, start_n_top, hsz)
 
-            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz)
+            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
+                start_states
+            )  # shape (bsz, slen, start_n_top, hsz)
             p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
             end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
-            end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
+            end_log_probs = F.softmax(end_logits, dim=1)  # shape (bsz, slen, start_n_top)
 
-            end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top)
+            end_top_log_probs, end_top_index = torch.topk(
+                end_log_probs, self.end_n_top, dim=1
+            )  # shape (bsz, end_n_top, start_n_top)
             end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
             end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
 
@@ -1148,34 +1282,35 @@ class SequenceSummary(nn.Module):
             summary_first_dropout: Add a dropout before the projection and activation
             summary_last_dropout: Add a dropout after the projection and activation
     """
+
     def __init__(self, config):
         super(SequenceSummary, self).__init__()
 
-        self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
-        if self.summary_type == 'attn':
+        self.summary_type = config.summary_type if hasattr(config, "summary_type") else "last"
+        if self.summary_type == "attn":
             # We should use a standard multi-head attention module with absolute positional embedding for that.
             # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
             # We can probably just use the multi-head attention module of PyTorch >=1.1.0
             raise NotImplementedError
 
         self.summary = Identity()
-        if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
-            if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
+        if hasattr(config, "summary_use_proj") and config.summary_use_proj:
+            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
                 num_classes = config.num_labels
             else:
                 num_classes = config.hidden_size
             self.summary = nn.Linear(config.hidden_size, num_classes)
 
         self.activation = Identity()
-        if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
+        if hasattr(config, "summary_activation") and config.summary_activation == "tanh":
             self.activation = nn.Tanh()
 
         self.first_dropout = Identity()
-        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
+        if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
             self.first_dropout = nn.Dropout(config.summary_first_dropout)
 
         self.last_dropout = Identity()
-        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
+        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
             self.last_dropout = nn.Dropout(config.summary_last_dropout)
 
     def forward(self, hidden_states, cls_index=None):
@@ -1185,21 +1320,21 @@ class SequenceSummary(nn.Module):
                 if summary_type == 'cls_index' and cls_index is None:
                     we take the last token of the sequence as classification token
         """
-        if self.summary_type == 'last':
+        if self.summary_type == "last":
             output = hidden_states[:, -1]
-        elif self.summary_type == 'first':
+        elif self.summary_type == "first":
             output = hidden_states[:, 0]
-        elif self.summary_type == 'mean':
+        elif self.summary_type == "mean":
             output = hidden_states.mean(dim=1)
-        elif self.summary_type == 'cls_index':
+        elif self.summary_type == "cls_index":
             if cls_index is None:
-                cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
+                cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2] - 1, dtype=torch.long)
             else:
                 cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
-                cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
+                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
             # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
-            output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)
-        elif self.summary_type == 'attn':
+            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
+        elif self.summary_type == "attn":
             raise NotImplementedError
 
         output = self.first_dropout(output)
diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py
index 5135f1e884..cd758a0433 100644
--- a/transformers/modeling_xlm.py
+++ b/transformers/modeling_xlm.py
@@ -34,24 +34,21 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 
 XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin",
-    'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-pytorch_model.bin",
-    'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-pytorch_model.bin",
-    'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-pytorch_model.bin",
-    'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-pytorch_model.bin",
-    'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin",
-    'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin",
-    'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin",
-    'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin",
-    'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin",
+    "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin",
+    "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-pytorch_model.bin",
+    "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-pytorch_model.bin",
+    "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-pytorch_model.bin",
+    "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-pytorch_model.bin",
+    "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin",
+    "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin",
+    "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin",
+    "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin",
+    "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin",
 }
 
 
 def create_sinusoidal_embeddings(n_pos, dim, out):
-    position_enc = np.array([
-        [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
-        for pos in range(n_pos)
-    ])
+    position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
     out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
     out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
     out.detach_()
@@ -142,7 +139,7 @@ class MultiHeadAttention(nn.Module):
         # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
         bs, qlen, dim = input.size()
         if kv is None:
-            klen = qlen if cache is None else cache['slen'] + qlen
+            klen = qlen if cache is None else cache["slen"] + qlen
         else:
             klen = kv.size(1)
         # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
@@ -158,39 +155,39 @@ class MultiHeadAttention(nn.Module):
             """  compute context """
             return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
 
-        q = shape(self.q_lin(input))                                          # (bs, n_heads, qlen, dim_per_head)
+        q = shape(self.q_lin(input))  # (bs, n_heads, qlen, dim_per_head)
         if kv is None:
-            k = shape(self.k_lin(input))                                      # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v_lin(input))                                      # (bs, n_heads, qlen, dim_per_head)
+            k = shape(self.k_lin(input))  # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v_lin(input))  # (bs, n_heads, qlen, dim_per_head)
         elif cache is None or self.layer_id not in cache:
             k = v = kv
-            k = shape(self.k_lin(k))                                          # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v_lin(v))                                          # (bs, n_heads, qlen, dim_per_head)
+            k = shape(self.k_lin(k))  # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v_lin(v))  # (bs, n_heads, qlen, dim_per_head)
 
         if cache is not None:
             if self.layer_id in cache:
                 if kv is None:
                     k_, v_ = cache[self.layer_id]
-                    k = torch.cat([k_, k], dim=2)                             # (bs, n_heads, klen, dim_per_head)
-                    v = torch.cat([v_, v], dim=2)                             # (bs, n_heads, klen, dim_per_head)
+                    k = torch.cat([k_, k], dim=2)  # (bs, n_heads, klen, dim_per_head)
+                    v = torch.cat([v_, v], dim=2)  # (bs, n_heads, klen, dim_per_head)
                 else:
                     k, v = cache[self.layer_id]
             cache[self.layer_id] = (k, v)
 
-        q = q / math.sqrt(dim_per_head)                                       # (bs, n_heads, qlen, dim_per_head)
-        scores = torch.matmul(q, k.transpose(2, 3))                           # (bs, n_heads, qlen, klen)
-        mask = (mask == 0).view(mask_reshape).expand_as(scores)               # (bs, n_heads, qlen, klen)
-        scores.masked_fill_(mask, -float('inf'))                              # (bs, n_heads, qlen, klen)
+        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, qlen, dim_per_head)
+        scores = torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, qlen, klen)
+        mask = (mask == 0).view(mask_reshape).expand_as(scores)  # (bs, n_heads, qlen, klen)
+        scores.masked_fill_(mask, -float("inf"))  # (bs, n_heads, qlen, klen)
 
-        weights = F.softmax(scores.float(), dim=-1).type_as(scores)           # (bs, n_heads, qlen, klen)
+        weights = F.softmax(scores.float(), dim=-1).type_as(scores)  # (bs, n_heads, qlen, klen)
         weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
 
         # Mask heads if we want to
         if head_mask is not None:
             weights = weights * head_mask
 
-        context = torch.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
-        context = unshape(context)                                            # (bs, qlen, dim)
+        context = torch.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)  # (bs, qlen, dim)
 
         outputs = (self.out_lin(context),)
         if self.output_attentions:
@@ -199,7 +196,6 @@ class MultiHeadAttention(nn.Module):
 
 
 class TransformerFFN(nn.Module):
-
     def __init__(self, in_dim, dim_hidden, out_dim, config):
         super(TransformerFFN, self).__init__()
         self.dropout = config.dropout
@@ -219,6 +215,7 @@ class XLMPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = XLMConfig
     pretrained_model_archive_map = XLM_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = None
@@ -235,7 +232,7 @@ class XLMPreTrainedModel(PreTrainedModel):
             langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
         else:
             langs_list = None
-        return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}
+        return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list}
 
     def _init_weights(self, module):
         """ Initialize the weights. """
@@ -245,8 +242,8 @@ class XLMPreTrainedModel(PreTrainedModel):
         if isinstance(module, nn.Linear):
             if self.config is not None and self.config.init_std is not None:
                 nn.init.normal_(module.weight, mean=0, std=self.config.init_std)
-                if hasattr(module, 'bias') and module.bias is not None:
-                    nn.init.constant_(module.bias, 0.)
+                if hasattr(module, "bias") and module.bias is not None:
+                    nn.init.constant_(module.bias, 0.0)
         if isinstance(module, nn.LayerNorm):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
@@ -327,8 +324,12 @@ XLM_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
-                      XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
+    XLM_START_DOCSTRING,
+    XLM_INPUTS_DOCSTRING,
+)
 class XLMModel(XLMPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -351,7 +352,8 @@ class XLMModel(XLMPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
-    def __init__(self, config):  #, dico, is_encoder, with_output):
+
+    def __init__(self, config):  # , dico, is_encoder, with_output):
         super(XLMModel, self).__init__(config)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
@@ -377,13 +379,13 @@ class XLMModel(XLMPreTrainedModel):
         # assert len(self.id2lang) == len(self.lang2id) == self.n_langs
 
         # model parameters
-        self.dim = config.emb_dim       # 512 by default
+        self.dim = config.emb_dim  # 512 by default
         self.hidden_dim = self.dim * 4  # 2048 by default
-        self.n_heads = config.n_heads   # 8 by default
+        self.n_heads = config.n_heads  # 8 by default
         self.n_layers = config.n_layers
         self.dropout = config.dropout
         self.attention_dropout = config.attention_dropout
-        assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads'
+        assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads"
 
         # embeddings
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
@@ -435,8 +437,18 @@ class XLMModel(XLMPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.attentions[layer].prune_heads(heads)
 
-    def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
-                lengths=None, cache=None, head_mask=None, inputs_embeds=None):  # removed: src_enc=None, src_len=None
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        langs=None,
+        token_type_ids=None,
+        position_ids=None,
+        lengths=None,
+        cache=None,
+        head_mask=None,
+        inputs_embeds=None,
+    ):  # removed: src_enc=None, src_len=None
         if input_ids is not None:
             bs, slen = input_ids.size()
         else:
@@ -446,7 +458,7 @@ class XLMModel(XLMPreTrainedModel):
             if input_ids is not None:
                 lengths = (input_ids != self.pad_index).sum(dim=1).long()
             else:
-                lengths = torch.LongTensor([slen]*bs)
+                lengths = torch.LongTensor([slen] * bs)
         # mask = input_ids != self.pad_index
 
         # check inputs
@@ -488,14 +500,18 @@ class XLMModel(XLMPreTrainedModel):
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                 head_mask = head_mask.expand(self.n_layers, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+                head_mask = (
+                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
+                )  # We can specify head_mask for each layer
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to fload if need + fp16 compatibility
         else:
             head_mask = [None] * self.n_layers
 
         # do not recompute cached elements
         if cache is not None and input_ids is not None:
-            _slen = slen - cache['slen']
+            _slen = slen - cache["slen"]
             input_ids = input_ids[:, -_slen:]
             position_ids = position_ids[:, -_slen:]
             if langs is not None:
@@ -550,7 +566,7 @@ class XLMModel(XLMPreTrainedModel):
 
         # update cache length
         if cache is not None:
-            cache['slen'] += tensor.size(1)
+            cache["slen"] += tensor.size(1)
 
         # move back sequence length to dimension 0
         # tensor = tensor.transpose(0, 1)
@@ -567,6 +583,7 @@ class XLMPredLayer(nn.Module):
     """
     Prediction layer (cross_entropy or adaptive_softmax).
     """
+
     def __init__(self, config):
         super(XLMPredLayer, self).__init__()
         self.asm = config.asm
@@ -593,7 +610,7 @@ class XLMPredLayer(nn.Module):
             scores = self.proj(x)
             outputs = (scores,) + outputs
             if y is not None:
-                loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction='elementwise_mean')
+                loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="elementwise_mean")
                 outputs = (loss,) + outputs
         else:
             scores = self.proj.log_prob(x)
@@ -605,9 +622,12 @@ class XLMPredLayer(nn.Module):
         return outputs
 
 
-@add_start_docstrings("""The XLM Model transformer with a language modeling head on top
+@add_start_docstrings(
+    """The XLM Model transformer with a language modeling head on top
     (linear layer with weights tied to the input embeddings). """,
-    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+    XLM_START_DOCSTRING,
+    XLM_INPUTS_DOCSTRING,
+)
 class XLMWithLMHeadModel(XLMPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -639,6 +659,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config):
         super(XLMWithLMHeadModel, self).__init__(config)
         self.transformer = XLMModel(config)
@@ -661,17 +682,30 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
             langs = None
         return {"input_ids": input_ids, "langs": langs}
 
-    def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
-                lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               attention_mask=attention_mask,
-                                               langs=langs,
-                                               token_type_ids=token_type_ids,
-                                               position_ids=position_ids,
-                                               lengths=lengths,
-                                               cache=cache,
-                                               head_mask=head_mask,
-                                               inputs_embeds=inputs_embeds)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        langs=None,
+        token_type_ids=None,
+        position_ids=None,
+        lengths=None,
+        cache=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         output = transformer_outputs[0]
         outputs = self.pred_layer(output, labels)
@@ -680,9 +714,12 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
         return outputs
 
 
-@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings(
+    """XLM Model with a sequence classification/regression head on top (a linear layer on top of
     the pooled output) e.g. for GLUE tasks. """,
-    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+    XLM_START_DOCSTRING,
+    XLM_INPUTS_DOCSTRING,
+)
 class XLMForSequenceClassification(XLMPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -714,6 +751,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
         loss, logits = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(XLMForSequenceClassification, self).__init__(config)
         self.num_labels = config.num_labels
@@ -723,17 +761,30 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
-                lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               attention_mask=attention_mask,
-                                               langs=langs,
-                                               token_type_ids=token_type_ids,
-                                               position_ids=position_ids,
-                                               lengths=lengths, 
-                                               cache=cache,
-                                               head_mask=head_mask,
-                                               inputs_embeds=inputs_embeds)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        langs=None,
+        token_type_ids=None,
+        position_ids=None,
+        lengths=None,
+        cache=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         output = transformer_outputs[0]
         logits = self.sequence_summary(output)
@@ -753,9 +804,12 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
         return outputs
 
 
-@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
     the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+    XLM_START_DOCSTRING,
+    XLM_INPUTS_DOCSTRING,
+)
 class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
     r"""
         **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -799,6 +853,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
         loss, start_scores, end_scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(XLMForQuestionAnsweringSimple, self).__init__(config)
 
@@ -807,17 +862,31 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
-                lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               attention_mask=attention_mask,
-                                               langs=langs,
-                                               token_type_ids=token_type_ids,
-                                               position_ids=position_ids,
-                                               lengths=lengths, 
-                                               cache=cache,
-                                               head_mask=head_mask,
-                                               inputs_embeds=inputs_embeds)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        langs=None,
+        token_type_ids=None,
+        position_ids=None,
+        lengths=None,
+        cache=None,
+        head_mask=None,
+        inputs_embeds=None,
+        start_positions=None,
+        end_positions=None,
+    ):
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         sequence_output = transformer_outputs[0]
 
@@ -826,7 +895,10 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)
 
-        outputs = (start_logits, end_logits,)
+        outputs = (
+            start_logits,
+            end_logits,
+        )
         if start_positions is not None and end_positions is not None:
             # If we are on multi-GPU, split add a dimension
             if len(start_positions.size()) > 1:
@@ -849,9 +921,12 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
         return outputs
 
 
-@add_start_docstrings("""XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
     the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+    XLM_START_DOCSTRING,
+    XLM_INPUTS_DOCSTRING,
+)
 class XLMForQuestionAnswering(XLMPreTrainedModel):
     r"""
         **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -895,6 +970,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
         loss, start_scores, end_scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(XLMForQuestionAnswering, self).__init__(config)
 
@@ -903,23 +979,45 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
-                lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None,
-                is_impossible=None, cls_index=None, p_mask=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               attention_mask=attention_mask,
-                                               langs=langs,
-                                               token_type_ids=token_type_ids,
-                                               position_ids=position_ids,
-                                               lengths=lengths, 
-                                               cache=cache,
-                                               head_mask=head_mask,
-                                               inputs_embeds=inputs_embeds)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        langs=None,
+        token_type_ids=None,
+        position_ids=None,
+        lengths=None,
+        cache=None,
+        head_mask=None,
+        inputs_embeds=None,
+        start_positions=None,
+        end_positions=None,
+        is_impossible=None,
+        cls_index=None,
+        p_mask=None,
+    ):
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         output = transformer_outputs[0]
 
-        outputs = self.qa_outputs(output, start_positions=start_positions, end_positions=end_positions,
-                                  cls_index=cls_index, is_impossible=is_impossible, p_mask=p_mask)
+        outputs = self.qa_outputs(
+            output,
+            start_positions=start_positions,
+            end_positions=end_positions,
+            cls_index=cls_index,
+            is_impossible=is_impossible,
+            p_mask=p_mask,
+        )
 
         outputs = outputs + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
 
diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py
index 0bdce941a5..8f1ed6ec67 100644
--- a/transformers/modeling_xlm_roberta.py
+++ b/transformers/modeling_xlm_roberta.py
@@ -15,24 +15,29 @@
 # limitations under the License.
 """PyTorch XLM-RoBERTa model. """
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 
-from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification
+from .modeling_roberta import (
+    RobertaModel,
+    RobertaForMaskedLM,
+    RobertaForSequenceClassification,
+    RobertaForMultipleChoice,
+    RobertaForTokenClassification,
+)
 from .configuration_xlm_roberta import XLMRobertaConfig
 from .file_utils import add_start_docstrings
 
 logger = logging.getLogger(__name__)
 
 XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-pytorch_model.bin",
-    'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-pytorch_model.bin",
-    'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-pytorch_model.bin",
-    'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-pytorch_model.bin",
-    'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-pytorch_model.bin",
-    'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-pytorch_model.bin",
+    "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-pytorch_model.bin",
+    "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-pytorch_model.bin",
+    "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-pytorch_model.bin",
+    "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-pytorch_model.bin",
+    "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-pytorch_model.bin",
+    "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-pytorch_model.bin",
 }
 
 
@@ -105,8 +110,12 @@ XLM_ROBERTA_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
-                      XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
+    XLM_ROBERTA_START_DOCSTRING,
+    XLM_ROBERTA_INPUTS_DOCSTRING,
+)
 class XLMRobertaModel(RobertaModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -154,8 +163,11 @@ class XLMRobertaModel(RobertaModel):
     pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
-@add_start_docstrings("""XLM-RoBERTa Model with a `language modeling` head on top. """,
-    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """XLM-RoBERTa Model with a `language modeling` head on top. """,
+    XLM_ROBERTA_START_DOCSTRING,
+    XLM_ROBERTA_INPUTS_DOCSTRING,
+)
 class XLMRobertaForMaskedLM(RobertaForMaskedLM):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -190,9 +202,12 @@ class XLMRobertaForMaskedLM(RobertaForMaskedLM):
     pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
-@add_start_docstrings("""XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 
+@add_start_docstrings(
+    """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 
     on top of the pooled output) e.g. for GLUE tasks. """,
-    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+    XLM_ROBERTA_START_DOCSTRING,
+    XLM_ROBERTA_INPUTS_DOCSTRING,
+)
 class XLMRobertaForSequenceClassification(RobertaForSequenceClassification):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -228,9 +243,12 @@ class XLMRobertaForSequenceClassification(RobertaForSequenceClassification):
     pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
-@add_start_docstrings("""XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of
     the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+    XLM_ROBERTA_START_DOCSTRING,
+    XLM_ROBERTA_INPUTS_DOCSTRING,
+)
 class XLMRobertaForMultipleChoice(RobertaForMultipleChoice):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -262,9 +280,12 @@ class XLMRobertaForMultipleChoice(RobertaForMultipleChoice):
     pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
-@add_start_docstrings("""XLM-RoBERTa Model with a token classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of
     the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+    XLM_ROBERTA_START_DOCSTRING,
+    XLM_ROBERTA_INPUTS_DOCSTRING,
+)
 class XLMRobertaForTokenClassification(RobertaForTokenClassification):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py
index 3109fd8cdf..2a210502d9 100644
--- a/transformers/modeling_xlnet.py
+++ b/transformers/modeling_xlnet.py
@@ -29,7 +29,14 @@ from torch import nn
 from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits
+from .modeling_utils import (
+    PreTrainedModel,
+    prune_linear_layer,
+    SequenceSummary,
+    PoolerAnswerClass,
+    PoolerEndLogits,
+    PoolerStartLogits,
+)
 from .configuration_xlnet import XLNetConfig
 from .file_utils import add_start_docstrings
 
@@ -37,8 +44,8 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 
 XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin",
-    'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin",
+    "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin",
+    "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin",
 }
 
 
@@ -50,44 +57,53 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
 
     tf_to_pt_map = {}
 
-    if hasattr(model, 'transformer'):
-        if hasattr(model, 'lm_loss'):
+    if hasattr(model, "transformer"):
+        if hasattr(model, "lm_loss"):
             # We will load also the output bias
-            tf_to_pt_map['model/lm_loss/bias'] = model.lm_loss.bias
-        if hasattr(model, 'sequence_summary') and 'model/sequnece_summary/summary/kernel' in tf_weights:
+            tf_to_pt_map["model/lm_loss/bias"] = model.lm_loss.bias
+        if hasattr(model, "sequence_summary") and "model/sequnece_summary/summary/kernel" in tf_weights:
             # We will load also the sequence summary
-            tf_to_pt_map['model/sequnece_summary/summary/kernel'] = model.sequence_summary.summary.weight
-            tf_to_pt_map['model/sequnece_summary/summary/bias'] = model.sequence_summary.summary.bias
-        if hasattr(model, 'logits_proj') and config.finetuning_task is not None \
-                and 'model/regression_{}/logit/kernel'.format(config.finetuning_task) in tf_weights:
-            tf_to_pt_map['model/regression_{}/logit/kernel'.format(config.finetuning_task)] = model.logits_proj.weight
-            tf_to_pt_map['model/regression_{}/logit/bias'.format(config.finetuning_task)] = model.logits_proj.bias
+            tf_to_pt_map["model/sequnece_summary/summary/kernel"] = model.sequence_summary.summary.weight
+            tf_to_pt_map["model/sequnece_summary/summary/bias"] = model.sequence_summary.summary.bias
+        if (
+            hasattr(model, "logits_proj")
+            and config.finetuning_task is not None
+            and "model/regression_{}/logit/kernel".format(config.finetuning_task) in tf_weights
+        ):
+            tf_to_pt_map["model/regression_{}/logit/kernel".format(config.finetuning_task)] = model.logits_proj.weight
+            tf_to_pt_map["model/regression_{}/logit/bias".format(config.finetuning_task)] = model.logits_proj.bias
 
         # Now load the rest of the transformer
         model = model.transformer
 
     # Embeddings and output
-    tf_to_pt_map.update({'model/transformer/word_embedding/lookup_table': model.word_embedding.weight,
-                         'model/transformer/mask_emb/mask_emb': model.mask_emb})
+    tf_to_pt_map.update(
+        {
+            "model/transformer/word_embedding/lookup_table": model.word_embedding.weight,
+            "model/transformer/mask_emb/mask_emb": model.mask_emb,
+        }
+    )
 
     # Transformer blocks
     for i, b in enumerate(model.layer):
         layer_str = "model/transformer/layer_%d/" % i
-        tf_to_pt_map.update({
-            layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight,
-            layer_str + "rel_attn/LayerNorm/beta": b.rel_attn.layer_norm.bias,
-            layer_str + "rel_attn/o/kernel": b.rel_attn.o,
-            layer_str + "rel_attn/q/kernel": b.rel_attn.q,
-            layer_str + "rel_attn/k/kernel": b.rel_attn.k,
-            layer_str + "rel_attn/r/kernel": b.rel_attn.r,
-            layer_str + "rel_attn/v/kernel": b.rel_attn.v,
-            layer_str + "ff/LayerNorm/gamma": b.ff.layer_norm.weight,
-            layer_str + "ff/LayerNorm/beta": b.ff.layer_norm.bias,
-            layer_str + "ff/layer_1/kernel": b.ff.layer_1.weight,
-            layer_str + "ff/layer_1/bias": b.ff.layer_1.bias,
-            layer_str + "ff/layer_2/kernel": b.ff.layer_2.weight,
-            layer_str + "ff/layer_2/bias": b.ff.layer_2.bias,
-        })
+        tf_to_pt_map.update(
+            {
+                layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight,
+                layer_str + "rel_attn/LayerNorm/beta": b.rel_attn.layer_norm.bias,
+                layer_str + "rel_attn/o/kernel": b.rel_attn.o,
+                layer_str + "rel_attn/q/kernel": b.rel_attn.q,
+                layer_str + "rel_attn/k/kernel": b.rel_attn.k,
+                layer_str + "rel_attn/r/kernel": b.rel_attn.r,
+                layer_str + "rel_attn/v/kernel": b.rel_attn.v,
+                layer_str + "ff/LayerNorm/gamma": b.ff.layer_norm.weight,
+                layer_str + "ff/LayerNorm/beta": b.ff.layer_norm.bias,
+                layer_str + "ff/layer_1/kernel": b.ff.layer_1.weight,
+                layer_str + "ff/layer_1/bias": b.ff.layer_1.bias,
+                layer_str + "ff/layer_2/kernel": b.ff.layer_2.weight,
+                layer_str + "ff/layer_2/bias": b.ff.layer_2.bias,
+            }
+        )
 
     # Relative positioning biases
     if config.untie_r:
@@ -105,13 +121,17 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
         r_w_list = [model.r_w_bias]
         r_s_list = [model.r_s_bias]
         seg_embed_list = [model.seg_embed]
-    tf_to_pt_map.update({
-        'model/transformer/r_r_bias': r_r_list,
-        'model/transformer/r_w_bias': r_w_list,
-        'model/transformer/r_s_bias': r_s_list,
-        'model/transformer/seg_embed': seg_embed_list})
+    tf_to_pt_map.update(
+        {
+            "model/transformer/r_r_bias": r_r_list,
+            "model/transformer/r_w_bias": r_w_list,
+            "model/transformer/r_s_bias": r_s_list,
+            "model/transformer/seg_embed": seg_embed_list,
+        }
+    )
     return tf_to_pt_map
 
+
 def load_tf_weights_in_xlnet(model, config, tf_path):
     """ Load tf checkpoints in a pytorch model
     """
@@ -119,8 +139,10 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error(
+            "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
         raise
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
@@ -141,7 +163,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
         array = tf_weights[name]
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
-        if 'kernel' in name and ('ff' in name or 'summary' in name or 'logit' in name):
+        if "kernel" in name and ("ff" in name or "summary" in name or "logit" in name):
             logger.info("Transposing")
             array = np.transpose(array)
         if isinstance(pointer, list):
@@ -165,10 +187,10 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
             logger.info("Initialize PyTorch weight {}".format(name))
             pointer.data = torch.from_numpy(array)
         tf_weights.pop(name, None)
-        tf_weights.pop(name + '/Adam', None)
-        tf_weights.pop(name + '/Adam_1', None)
+        tf_weights.pop(name + "/Adam", None)
+        tf_weights.pop(name + "/Adam_1", None)
 
-    logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys())))
     return model
 
 
@@ -199,7 +221,8 @@ class XLNetRelativeAttention(nn.Module):
         if config.d_model % config.n_head != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.d_model, config.n_head))
+                "heads (%d)" % (config.d_model, config.n_head)
+            )
 
         self.n_head = config.n_head
         self.d_head = config.d_head
@@ -242,7 +265,7 @@ class XLNetRelativeAttention(nn.Module):
 
         x = x.reshape(x_size[0], x_size[1], x_size[3], x_size[2])
         x = x[:, :, 1:, :]
-        x = x.reshape(x_size[0], x_size[1], x_size[2], x_size[3]-1)
+        x = x.reshape(x_size[0], x_size[1], x_size[2], x_size[3] - 1)
         # Note: the tensor-slice form was faster in my testing than torch.index_select
         #       However, tracing doesn't like the nature of the slice, and if klen changes
         #       during the run then it'll fail, whereas index_select will be fine.
@@ -255,27 +278,27 @@ class XLNetRelativeAttention(nn.Module):
         """Core relative positional attention operations."""
 
         # content based attention score
-        ac = torch.einsum('ibnd,jbnd->bnij', q_head + self.r_w_bias, k_head_h)
+        ac = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_w_bias, k_head_h)
 
         # position based attention score
-        bd = torch.einsum('ibnd,jbnd->bnij', q_head + self.r_r_bias, k_head_r)
+        bd = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_r_bias, k_head_r)
         bd = self.rel_shift_bnij(bd, klen=ac.shape[3])
 
         # segment based attention score
         if seg_mat is None:
             ef = 0
         else:
-            ef = torch.einsum('ibnd,snd->ibns', q_head + self.r_s_bias, self.seg_embed)
-            ef = torch.einsum('ijbs,ibns->bnij', seg_mat, ef)
+            ef = torch.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed)
+            ef = torch.einsum("ijbs,ibns->bnij", seg_mat, ef)
 
         # merge attention scores and perform masking
         attn_score = (ac + bd + ef) * self.scale
         if attn_mask is not None:
             # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask
             if attn_mask.dtype == torch.float16:
-                attn_score = attn_score - 65500 * torch.einsum('ijbn->bnij', attn_mask)
+                attn_score = attn_score - 65500 * torch.einsum("ijbn->bnij", attn_mask)
             else:
-                attn_score = attn_score - 1e30 * torch.einsum('ijbn->bnij', attn_mask)
+                attn_score = attn_score - 1e30 * torch.einsum("ijbn->bnij", attn_mask)
 
         # attention probability
         attn_prob = F.softmax(attn_score, dim=3)
@@ -283,20 +306,20 @@ class XLNetRelativeAttention(nn.Module):
 
         # Mask heads if we want to
         if head_mask is not None:
-            attn_prob = attn_prob * torch.einsum('ijbn->bnij', head_mask)
+            attn_prob = attn_prob * torch.einsum("ijbn->bnij", head_mask)
 
         # attention output
-        attn_vec = torch.einsum('bnij,jbnd->ibnd', attn_prob, v_head_h)
+        attn_vec = torch.einsum("bnij,jbnd->ibnd", attn_prob, v_head_h)
 
         if self.output_attentions:
-            return attn_vec, torch.einsum('bnij->ijbn', attn_prob)
+            return attn_vec, torch.einsum("bnij->ijbn", attn_prob)
 
         return attn_vec
 
     def post_attention(self, h, attn_vec, residual=True):
         """Post-attention processing."""
         # post-attention projection (back to `d_model`)
-        attn_out = torch.einsum('ibnd,hnd->ibh', attn_vec, self.o)
+        attn_out = torch.einsum("ibnd,hnd->ibh", attn_vec, self.o)
 
         attn_out = self.dropout(attn_out)
         if residual:
@@ -305,10 +328,7 @@ class XLNetRelativeAttention(nn.Module):
 
         return output
 
-    def forward(self, h, g,
-                      attn_mask_h, attn_mask_g,
-                      r, seg_mat,
-                      mems=None, target_mapping=None, head_mask=None):
+    def forward(self, h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None):
         if g is not None:
             ###### Two-stream attention with relative positional encoding.
             # content based attention score
@@ -318,21 +338,22 @@ class XLNetRelativeAttention(nn.Module):
                 cat = h
 
             # content-based key head
-            k_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.k)
+            k_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.k)
 
             # content-based value head
-            v_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.v)
+            v_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.v)
 
             # position-based key head
-            k_head_r = torch.einsum('ibh,hnd->ibnd', r, self.r)
+            k_head_r = torch.einsum("ibh,hnd->ibnd", r, self.r)
 
             ##### h-stream
             # content-stream query head
-            q_head_h = torch.einsum('ibh,hnd->ibnd', h, self.q)
+            q_head_h = torch.einsum("ibh,hnd->ibnd", h, self.q)
 
             # core attention ops
             attn_vec_h = self.rel_attn_core(
-                q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask)
+                q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask
+            )
 
             if self.output_attentions:
                 attn_vec_h, attn_prob_h = attn_vec_h
@@ -342,21 +363,23 @@ class XLNetRelativeAttention(nn.Module):
 
             ##### g-stream
             # query-stream query head
-            q_head_g = torch.einsum('ibh,hnd->ibnd', g, self.q)
+            q_head_g = torch.einsum("ibh,hnd->ibnd", g, self.q)
 
             # core attention ops
             if target_mapping is not None:
-                q_head_g = torch.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping)
+                q_head_g = torch.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping)
                 attn_vec_g = self.rel_attn_core(
-                    q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask)
+                    q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask
+                )
 
                 if self.output_attentions:
                     attn_vec_g, attn_prob_g = attn_vec_g
 
-                attn_vec_g = torch.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping)
+                attn_vec_g = torch.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping)
             else:
                 attn_vec_g = self.rel_attn_core(
-                    q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask)
+                    q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask
+                )
 
                 if self.output_attentions:
                     attn_vec_g, attn_prob_g = attn_vec_g
@@ -375,16 +398,17 @@ class XLNetRelativeAttention(nn.Module):
                 cat = h
 
             # content heads
-            q_head_h = torch.einsum('ibh,hnd->ibnd', h, self.q)
-            k_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.k)
-            v_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.v)
+            q_head_h = torch.einsum("ibh,hnd->ibnd", h, self.q)
+            k_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.k)
+            v_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.v)
 
             # positional heads
-            k_head_r = torch.einsum('ibh,hnd->ibnd', r, self.r)
+            k_head_r = torch.einsum("ibh,hnd->ibnd", r, self.r)
 
             # core attention ops
             attn_vec = self.rel_attn_core(
-                q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask)
+                q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask
+            )
 
             if self.output_attentions:
                 attn_vec, attn_prob = attn_vec
@@ -398,6 +422,7 @@ class XLNetRelativeAttention(nn.Module):
             outputs = outputs + (attn_prob,)
         return outputs
 
+
 class XLNetFeedForward(nn.Module):
     def __init__(self, config):
         super(XLNetFeedForward, self).__init__()
@@ -405,8 +430,9 @@ class XLNetFeedForward(nn.Module):
         self.layer_1 = nn.Linear(config.d_model, config.d_inner)
         self.layer_2 = nn.Linear(config.d_inner, config.d_model)
         self.dropout = nn.Dropout(config.dropout)
-        if isinstance(config.ff_activation, str) or \
-                (sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)):
+        if isinstance(config.ff_activation, str) or (
+            sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)
+        ):
             self.activation_function = ACT2FN[config.ff_activation]
         else:
             self.activation_function = config.ff_activation
@@ -421,6 +447,7 @@ class XLNetFeedForward(nn.Module):
         output = self.layer_norm(output + inp)
         return output
 
+
 class XLNetLayer(nn.Module):
     def __init__(self, config):
         super(XLNetLayer, self).__init__()
@@ -428,12 +455,20 @@ class XLNetLayer(nn.Module):
         self.ff = XLNetFeedForward(config)
         self.dropout = nn.Dropout(config.dropout)
 
-    def forward(self, output_h, output_g,
-                attn_mask_h, attn_mask_g,
-                r, seg_mat, mems=None, target_mapping=None, head_mask=None):
-        outputs = self.rel_attn(output_h, output_g, attn_mask_h, attn_mask_g,
-                                r, seg_mat, mems=mems, target_mapping=target_mapping,
-                                head_mask=head_mask)
+    def forward(
+        self, output_h, output_g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None
+    ):
+        outputs = self.rel_attn(
+            output_h,
+            output_g,
+            attn_mask_h,
+            attn_mask_g,
+            r,
+            seg_mat,
+            mems=mems,
+            target_mapping=target_mapping,
+            head_mask=head_mask,
+        )
         output_h, output_g = outputs[:2]
 
         if output_g is not None:
@@ -448,6 +483,7 @@ class XLNetPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     config_class = XLNetConfig
     pretrained_model_archive_map = XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_xlnet
@@ -466,12 +502,20 @@ class XLNetPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
         elif isinstance(module, XLNetRelativeAttention):
-            for param in [module.q, module.k, module.v, module.o, module.r,
-                          module.r_r_bias, module.r_s_bias, module.r_w_bias,
-                          module.seg_embed]:
+            for param in [
+                module.q,
+                module.k,
+                module.v,
+                module.o,
+                module.r,
+                module.r_r_bias,
+                module.r_s_bias,
+                module.r_w_bias,
+                module.seg_embed,
+            ]:
                 param.data.normal_(mean=0.0, std=self.config.initializer_range)
         elif isinstance(module, XLNetModel):
-                module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range)
+            module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range)
 
 
 XLNET_START_DOCSTRING = r"""    The XLNet model was proposed in
@@ -564,8 +608,12 @@ XLNET_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
 """
 
-@add_start_docstrings("The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
-                      XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+
+@add_start_docstrings(
+    "The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
+    XLNET_START_DOCSTRING,
+    XLNET_INPUTS_DOCSTRING,
+)
 class XLNetModel(XLNetPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -594,6 +642,7 @@ class XLNetModel(XLNetPreTrainedModel):
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
+
     def __init__(self, config):
         super(XLNetModel, self).__init__(config)
         self.output_attentions = config.output_attentions
@@ -658,18 +707,18 @@ class XLNetModel(XLNetPreTrainedModel):
     def cache_mem(self, curr_out, prev_mem):
         """cache hidden states into memory."""
         if self.reuse_len is not None and self.reuse_len > 0:
-            curr_out = curr_out[:self.reuse_len]
+            curr_out = curr_out[: self.reuse_len]
 
         if prev_mem is None:
-            new_mem = curr_out[-self.mem_len:]
+            new_mem = curr_out[-self.mem_len :]
         else:
-            new_mem = torch.cat([prev_mem, curr_out], dim=0)[-self.mem_len:]
+            new_mem = torch.cat([prev_mem, curr_out], dim=0)[-self.mem_len :]
 
         return new_mem.detach()
 
     @staticmethod
     def positional_embedding(pos_seq, inv_freq, bsz=None):
-        sinusoid_inp = torch.einsum('i,d->id', pos_seq, inv_freq)
+        sinusoid_inp = torch.einsum("i,d->id", pos_seq, inv_freq)
         pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1)
         pos_emb = pos_emb[:, None, :]
 
@@ -683,14 +732,14 @@ class XLNetModel(XLNetPreTrainedModel):
         freq_seq = torch.arange(0, self.d_model, 2.0, dtype=torch.float)
         inv_freq = 1 / torch.pow(10000, (freq_seq / self.d_model))
 
-        if self.attn_type == 'bi':
+        if self.attn_type == "bi":
             # beg, end = klen - 1, -qlen
             beg, end = klen, -qlen
-        elif self.attn_type == 'uni':
+        elif self.attn_type == "uni":
             # beg, end = klen - 1, -1
             beg, end = klen, -1
         else:
-            raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type))
+            raise ValueError("Unknown `attn_type` {}.".format(self.attn_type))
 
         if self.bi_data:
             fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.float)
@@ -701,8 +750,8 @@ class XLNetModel(XLNetPreTrainedModel):
                 bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
 
             if bsz is not None:
-                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
-                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
+                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2)
+                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2)
             else:
                 fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
                 bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
@@ -717,8 +766,18 @@ class XLNetModel(XLNetPreTrainedModel):
         pos_emb = pos_emb.to(next(self.parameters()))
         return pos_emb
 
-    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        mems=None,
+        perm_mask=None,
+        target_mapping=None,
+        token_type_ids=None,
+        input_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+    ):
         # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
         # but we want a unified interface in the library with the batch size on the first dimension
         # so we move here the first dimension (batch) to the end
@@ -739,7 +798,6 @@ class XLNetModel(XLNetPreTrainedModel):
         perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
         target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
 
-
         mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0
         klen = mlen + qlen
 
@@ -748,13 +806,13 @@ class XLNetModel(XLNetPreTrainedModel):
 
         ##### Attention mask
         # causal attention mask
-        if self.attn_type == 'uni':
+        if self.attn_type == "uni":
             attn_mask = self.create_mask(qlen, mlen)
             attn_mask = attn_mask[:, :, None, None]
-        elif self.attn_type == 'bi':
+        elif self.attn_type == "bi":
             attn_mask = None
         else:
-            raise ValueError('Unsupported attention type: {}'.format(self.attn_type))
+            raise ValueError("Unsupported attention type: {}".format(self.attn_type))
 
         # data mask: input mask & perm mask
         assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) "
@@ -799,9 +857,9 @@ class XLNetModel(XLNetPreTrainedModel):
         output_h = self.dropout(word_emb_k)
         if target_mapping is not None:
             word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1)
-        # else:  # We removed the inp_q input which was same as target mapping
-        #     inp_q_ext = inp_q[:, :, None]
-        #     word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
+            # else:  # We removed the inp_q input which was same as target mapping
+            #     inp_q_ext = inp_q[:, :, None]
+            #     word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
             output_g = self.dropout(word_emb_q)
         else:
             output_g = None
@@ -836,7 +894,9 @@ class XLNetModel(XLNetPreTrainedModel):
                 head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
                 head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to fload if need + fp16 compatibility
         else:
             head_mask = [None] * self.n_layer
 
@@ -853,9 +913,17 @@ class XLNetModel(XLNetPreTrainedModel):
             if self.output_hidden_states:
                 hidden_states.append((output_h, output_g) if output_g is not None else output_h)
 
-            outputs = layer_module(output_h, output_g, attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask,
-                                   r=pos_emb, seg_mat=seg_mat, mems=mems[i], target_mapping=target_mapping,
-                                   head_mask=head_mask[i])
+            outputs = layer_module(
+                output_h,
+                output_g,
+                attn_mask_h=non_tgt_mask,
+                attn_mask_g=attn_mask,
+                r=pos_emb,
+                seg_mat=seg_mat,
+                mems=mems[i],
+                target_mapping=target_mapping,
+                head_mask=head_mask[i],
+            )
             output_h, output_g = outputs[:2]
             if self.output_attentions:
                 attentions.append(outputs[2])
@@ -881,7 +949,9 @@ class XLNetModel(XLNetPreTrainedModel):
         if self.output_attentions:
             if target_mapping is not None:
                 # when target_mapping is provided, there are 2-tuple of attentions
-                attentions = tuple(tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions)
+                attentions = tuple(
+                    tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions
+                )
             else:
                 attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
             outputs = outputs + (attentions,)
@@ -889,9 +959,12 @@ class XLNetModel(XLNetPreTrainedModel):
         return outputs  # outputs, (new_mems), (hidden_states), (attentions)
 
 
-@add_start_docstrings("""XLNet Model with a language modeling head on top
+@add_start_docstrings(
+    """XLNet Model with a language modeling head on top
     (linear layer with weights tied to the input embeddings). """,
-    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+    XLNET_START_DOCSTRING,
+    XLNET_INPUTS_DOCSTRING,
+)
 class XLNetLMHeadModel(XLNetPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -934,6 +1007,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
 
     """
+
     def __init__(self, config):
         super(XLNetLMHeadModel, self).__init__(config)
         self.attn_type = config.attn_type
@@ -954,34 +1028,42 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
         # Build permutation mask so that previous tokens don't see last token
         perm_mask = torch.zeros(
-            (input_ids.shape[0], input_ids.shape[1], input_ids.shape[1]),
-            dtype=torch.float, device=input_ids.device
+            (input_ids.shape[0], input_ids.shape[1], input_ids.shape[1]), dtype=torch.float, device=input_ids.device
         )
         perm_mask[:, :, -1] = 1.0
 
         # We'll only predict the last token
         target_mapping = torch.zeros(
-            (input_ids.shape[0], 1, input_ids.shape[1]),
-            dtype=torch.float, device=input_ids.device
+            (input_ids.shape[0], 1, input_ids.shape[1]), dtype=torch.float, device=input_ids.device
         )
         target_mapping[0, 0, -1] = 1.0
 
-        return {"input_ids": input_ids,
-                "perm_mask": perm_mask,
-                "target_mapping": target_mapping
-               }
+        return {"input_ids": input_ids, "perm_mask": perm_mask, "target_mapping": target_mapping}
 
-    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               attention_mask=attention_mask,
-                                               mems=mems,
-                                               perm_mask=perm_mask,
-                                               target_mapping=target_mapping,
-                                               token_type_ids=token_type_ids,
-                                               input_mask=input_mask,
-                                               head_mask=head_mask,
-                                               inputs_embeds=inputs_embeds)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        mems=None,
+        perm_mask=None,
+        target_mapping=None,
+        token_type_ids=None,
+        input_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            mems=mems,
+            perm_mask=perm_mask,
+            target_mapping=target_mapping,
+            token_type_ids=token_type_ids,
+            input_mask=input_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         logits = self.lm_loss(transformer_outputs[0])
 
@@ -990,16 +1072,18 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         if labels is not None:
             # Flatten the tokens
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, logits.size(-1)),
-                            labels.view(-1))
+            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
             outputs = (loss,) + outputs
 
         return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
 
 
-@add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings(
+    """XLNet Model with a sequence classification/regression head on top (a linear layer on top of
     the pooled output) e.g. for GLUE tasks. """,
-    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+    XLNET_START_DOCSTRING,
+    XLNET_INPUTS_DOCSTRING,
+)
 class XLNetForSequenceClassification(XLNetPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -1037,6 +1121,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         loss, logits = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(XLNetForSequenceClassification, self).__init__(config)
         self.num_labels = config.num_labels
@@ -1047,17 +1132,30 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               attention_mask=attention_mask,
-                                               mems=mems,
-                                               perm_mask=perm_mask,
-                                               target_mapping=target_mapping,
-                                               token_type_ids=token_type_ids,
-                                               input_mask=input_mask,
-                                               head_mask=head_mask,
-                                               inputs_embeds=inputs_embeds)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        mems=None,
+        perm_mask=None,
+        target_mapping=None,
+        token_type_ids=None,
+        input_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            mems=mems,
+            perm_mask=perm_mask,
+            target_mapping=target_mapping,
+            token_type_ids=token_type_ids,
+            input_mask=input_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
         output = transformer_outputs[0]
 
         output = self.sequence_summary(output)
@@ -1077,10 +1175,13 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
 
         return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
 
-@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of
+
+@add_start_docstrings(
+    """XLNet Model with a token classification head on top (a linear layer on top of
                       the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-                      XLNET_START_DOCSTRING,
-                      XLNET_INPUTS_DOCSTRING)
+    XLNET_START_DOCSTRING,
+    XLNET_INPUTS_DOCSTRING,
+)
 class XLNetForTokenClassification(XLNetPreTrainedModel):
     r"""
     Inputs:
@@ -1135,6 +1236,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
         scores = outputs[0]
 
     """
+
     def __init__(self, config):
         super(XLNetForTokenClassification, self).__init__(config)
         self.num_labels = config.num_labels
@@ -1144,18 +1246,31 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        mems=None,
+        perm_mask=None,
+        target_mapping=None,
+        token_type_ids=None,
+        input_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
 
-        outputs = self.transformer(input_ids,
-                            attention_mask=attention_mask,
-                            mems=mems,
-                            perm_mask=perm_mask,
-                            target_mapping=target_mapping,
-                            token_type_ids=token_type_ids,
-                            input_mask=input_mask,
-                            head_mask=head_mask,
-                            inputs_embeds=inputs_embeds)
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            mems=mems,
+            perm_mask=perm_mask,
+            target_mapping=target_mapping,
+            token_type_ids=token_type_ids,
+            input_mask=input_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         sequence_output = outputs[0]
 
@@ -1177,9 +1292,12 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
         return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
 
 
-@add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """XLNet Model with a multiple choice classification head on top (a linear layer on top of
     the pooled output and a softmax) e.g. for RACE/SWAG tasks. """,
-    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+    XLNET_START_DOCSTRING,
+    XLNET_INPUTS_DOCSTRING,
+)
 class XLNetForMultipleChoice(XLNetPreTrainedModel):
     r"""
     Inputs:
@@ -1239,6 +1357,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
         loss, classification_scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(XLNetForMultipleChoice, self).__init__(config)
 
@@ -1248,9 +1367,19 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None,
-                labels=None, head_mask=None, inputs_embeds=None):
+    def forward(
+        self,
+        input_ids=None,
+        token_type_ids=None,
+        input_mask=None,
+        attention_mask=None,
+        mems=None,
+        perm_mask=None,
+        target_mapping=None,
+        labels=None,
+        head_mask=None,
+        inputs_embeds=None,
+    ):
         num_choices = input_ids.shape[1]
 
         flat_input_ids = input_ids.view(-1, input_ids.size(-1))
@@ -1258,18 +1387,26 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
         flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
         flat_input_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None
 
-        transformer_outputs = self.transformer(flat_input_ids, token_type_ids=flat_token_type_ids,
-                                               input_mask=flat_input_mask, attention_mask=flat_attention_mask,
-                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
-                                               head_mask=head_mask, inputs_embeds=inputs_embeds)
-
+        transformer_outputs = self.transformer(
+            flat_input_ids,
+            token_type_ids=flat_token_type_ids,
+            input_mask=flat_input_mask,
+            attention_mask=flat_attention_mask,
+            mems=mems,
+            perm_mask=perm_mask,
+            target_mapping=target_mapping,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         output = transformer_outputs[0]
 
         output = self.sequence_summary(output)
         logits = self.logits_proj(output)
         reshaped_logits = logits.view(-1, num_choices)
-        outputs = (reshaped_logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
+        outputs = (reshaped_logits,) + transformer_outputs[
+            1:
+        ]  # Keep mems, hidden states, attentions if there are in it
 
         if labels is not None:
             loss_fct = CrossEntropyLoss()
@@ -1279,9 +1416,12 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
         return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
 
 
-@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
     the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+    XLNET_START_DOCSTRING,
+    XLNET_INPUTS_DOCSTRING,
+)
 class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
     r"""
         **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -1325,6 +1465,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
         loss, start_scores, end_scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(XLNetForQuestionAnsweringSimple, self).__init__(config)
         self.num_labels = config.num_labels
@@ -1334,19 +1475,32 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None,
-                start_positions=None, end_positions=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        mems=None,
+        perm_mask=None,
+        target_mapping=None,
+        token_type_ids=None,
+        input_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        start_positions=None,
+        end_positions=None,
+    ):
 
-        outputs = self.transformer(input_ids,
-                                    attention_mask=attention_mask,
-                                    mems=mems,
-                                    perm_mask=perm_mask,
-                                    target_mapping=target_mapping,
-                                    token_type_ids=token_type_ids,
-                                    input_mask=input_mask,
-                                    head_mask=head_mask,
-                                    inputs_embeds=inputs_embeds)
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            mems=mems,
+            perm_mask=perm_mask,
+            target_mapping=target_mapping,
+            token_type_ids=token_type_ids,
+            input_mask=input_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
 
         sequence_output = outputs[0]
 
@@ -1376,9 +1530,12 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
         return outputs  # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions)
 
 
-@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
     the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+    XLNET_START_DOCSTRING,
+    XLNET_INPUTS_DOCSTRING,
+)
 class XLNetForQuestionAnswering(XLNetPreTrainedModel):
     r"""
         **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -1440,6 +1597,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
         loss, start_scores, end_scores = outputs[:2]
 
     """
+
     def __init__(self, config):
         super(XLNetForQuestionAnswering, self).__init__(config)
         self.start_n_top = config.start_n_top
@@ -1452,18 +1610,34 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
 
         self.init_weights()
 
-    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None,
-                start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,):
-        transformer_outputs = self.transformer(input_ids,
-                                               attention_mask=attention_mask,
-                                               mems=mems,
-                                               perm_mask=perm_mask,
-                                               target_mapping=target_mapping,
-                                               token_type_ids=token_type_ids,
-                                               input_mask=input_mask,
-                                               head_mask=head_mask,
-                                               inputs_embeds=inputs_embeds)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        mems=None,
+        perm_mask=None,
+        target_mapping=None,
+        token_type_ids=None,
+        input_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        start_positions=None,
+        end_positions=None,
+        is_impossible=None,
+        cls_index=None,
+        p_mask=None,
+    ):
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            mems=mems,
+            perm_mask=perm_mask,
+            target_mapping=target_mapping,
+            token_type_ids=token_type_ids,
+            input_mask=input_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
         hidden_states = transformer_outputs[0]
         start_logits = self.start_logits(hidden_states, p_mask=p_mask)
 
@@ -1497,24 +1671,34 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
         else:
             # during inference, compute the end logits based on beam search
             bsz, slen, hsz = hidden_states.size()
-            start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen)
+            start_log_probs = F.softmax(start_logits, dim=-1)  # shape (bsz, slen)
 
-            start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top)
-            start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
-            start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz)
-            start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
+            start_top_log_probs, start_top_index = torch.topk(
+                start_log_probs, self.start_n_top, dim=-1
+            )  # shape (bsz, start_n_top)
+            start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz)  # shape (bsz, start_n_top, hsz)
+            start_states = torch.gather(hidden_states, -2, start_top_index_exp)  # shape (bsz, start_n_top, hsz)
+            start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1)  # shape (bsz, slen, start_n_top, hsz)
 
-            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz)
+            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
+                start_states
+            )  # shape (bsz, slen, start_n_top, hsz)
             p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
             end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
-            end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
+            end_log_probs = F.softmax(end_logits, dim=1)  # shape (bsz, slen, start_n_top)
 
-            end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top)
+            end_top_log_probs, end_top_index = torch.topk(
+                end_log_probs, self.end_n_top, dim=1
+            )  # shape (bsz, end_n_top, start_n_top)
             end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
             end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
 
-            start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)  # get the representation of START as weighted sum of hidden states
-            cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)  # Shape (batch size,): one single `cls_logits` for each sample
+            start_states = torch.einsum(
+                "blh,bl->bh", hidden_states, start_log_probs
+            )  # get the representation of START as weighted sum of hidden states
+            cls_logits = self.answer_class(
+                hidden_states, start_states=start_states, cls_index=cls_index
+            )  # Shape (batch size,): one single `cls_logits` for each sample
 
             outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
 
diff --git a/transformers/optimization.py b/transformers/optimization.py
index 99e6cc75e4..0cd57078ba 100644
--- a/transformers/optimization.py
+++ b/transformers/optimization.py
@@ -34,10 +34,11 @@ def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1
     """ Create a schedule with a constant learning rate preceded by a warmup
     period during which the learning rate increases linearly between 0 and 1.
     """
+
     def lr_lambda(current_step):
         if current_step < num_warmup_steps:
             return float(current_step) / float(max(1.0, num_warmup_steps))
-        return 1.
+        return 1.0
 
     return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
 
@@ -46,40 +47,47 @@ def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_st
     """ Create a schedule with a learning rate that decreases linearly after
     linearly increasing during a warmup period.
     """
+
     def lr_lambda(current_step):
         if current_step < num_warmup_steps:
             return float(current_step) / float(max(1, num_warmup_steps))
-        return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))
+        return max(
+            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
+        )
 
     return LambdaLR(optimizer, lr_lambda, last_epoch)
 
 
-def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1):
+def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1):
     """ Create a schedule with a learning rate that decreases following the
     values of the cosine function between 0 and `pi * cycles` after a warmup
     period during which it increases linearly between 0 and 1.
     """
+
     def lr_lambda(current_step):
         if current_step < num_warmup_steps:
             return float(current_step) / float(max(1, num_warmup_steps))
         progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
-        return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress)))
+        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
 
     return LambdaLR(optimizer, lr_lambda, last_epoch)
 
 
-def get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=1., last_epoch=-1):
+def get_cosine_with_hard_restarts_schedule_with_warmup(
+    optimizer, num_warmup_steps, num_training_steps, num_cycles=1.0, last_epoch=-1
+):
     """ Create a schedule with a learning rate that decreases following the
     values of the cosine function with several hard restarts, after a warmup
     period during which it increases linearly between 0 and 1.
     """
+
     def lr_lambda(current_step):
         if current_step < num_warmup_steps:
             return float(current_step) / float(max(1, num_warmup_steps))
         progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
-        if progress >= 1.:
-            return 0.
-        return max(0., 0.5 * (1. + math.cos(math.pi * ((float(num_cycles) * progress) % 1.))))
+        if progress >= 1.0:
+            return 0.0
+        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))
 
     return LambdaLR(optimizer, lr_lambda, last_epoch)
 
@@ -94,17 +102,17 @@ class AdamW(Optimizer):
         weight_decay (float): Weight decay. Default: 0.0
         correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True.
     """
+
     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True):
         if lr < 0.0:
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
         if not 0.0 <= betas[0] < 1.0:
             raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
-        if not 0.0 <= betas[1]  < 1.0:
+        if not 0.0 <= betas[1] < 1.0:
             raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
         if not 0.0 <= eps:
             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
-                        correct_bias=correct_bias)
+        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
         super(AdamW, self).__init__(params, defaults)
 
     def step(self, closure=None):
@@ -119,38 +127,38 @@ class AdamW(Optimizer):
             loss = closure()
 
         for group in self.param_groups:
-            for p in group['params']:
+            for p in group["params"]:
                 if p.grad is None:
                     continue
                 grad = p.grad.data
                 if grad.is_sparse:
-                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
+                    raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead")
 
                 state = self.state[p]
 
                 # State initialization
                 if len(state) == 0:
-                    state['step'] = 0
+                    state["step"] = 0
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
+                    state["exp_avg"] = torch.zeros_like(p.data)
                     # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+                    state["exp_avg_sq"] = torch.zeros_like(p.data)
 
-                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
-                beta1, beta2 = group['betas']
+                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
+                beta1, beta2 = group["betas"]
 
-                state['step'] += 1
+                state["step"] += 1
 
                 # Decay the first and second moment running average coefficient
                 # In-place operations to update the averages at the same time
                 exp_avg.mul_(beta1).add_(1.0 - beta1, grad)
                 exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad)
-                denom = exp_avg_sq.sqrt().add_(group['eps'])
+                denom = exp_avg_sq.sqrt().add_(group["eps"])
 
-                step_size = group['lr']
-                if group['correct_bias']:  # No bias correction for Bert
-                    bias_correction1 = 1.0 - beta1 ** state['step']
-                    bias_correction2 = 1.0 - beta2 ** state['step']
+                step_size = group["lr"]
+                if group["correct_bias"]:  # No bias correction for Bert
+                    bias_correction1 = 1.0 - beta1 ** state["step"]
+                    bias_correction2 = 1.0 - beta2 ** state["step"]
                     step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
 
                 p.data.addcdiv_(-step_size, exp_avg, denom)
@@ -163,7 +171,7 @@ class AdamW(Optimizer):
                 # with the m/v parameters. This is equivalent to adding the square
                 # of the weights to the loss with plain (non-momentum) SGD.
                 # Add weight decay at the end (fixed version)
-                if group['weight_decay'] > 0.0:
-                    p.data.add_(-group['lr'] * group['weight_decay'], p.data)
+                if group["weight_decay"] > 0.0:
+                    p.data.add_(-group["lr"] * group["weight_decay"], p.data)
 
         return loss
diff --git a/transformers/optimization_tf.py b/transformers/optimization_tf.py
index c5fa248083..bdcbd323ce 100644
--- a/transformers/optimization_tf.py
+++ b/transformers/optimization_tf.py
@@ -24,70 +24,64 @@ import tensorflow as tf
 
 
 class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
-  """Applys a warmup schedule on a given learning rate decay schedule."""
+    """Applys a warmup schedule on a given learning rate decay schedule."""
 
-  def __init__(
-      self,
-      initial_learning_rate,
-      decay_schedule_fn,
-      warmup_steps,
-      power=1.0,
-      name=None):
-    super(WarmUp, self).__init__()
-    self.initial_learning_rate = initial_learning_rate
-    self.warmup_steps = warmup_steps
-    self.power = power
-    self.decay_schedule_fn = decay_schedule_fn
-    self.name = name
+    def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None):
+        super(WarmUp, self).__init__()
+        self.initial_learning_rate = initial_learning_rate
+        self.warmup_steps = warmup_steps
+        self.power = power
+        self.decay_schedule_fn = decay_schedule_fn
+        self.name = name
 
-  def __call__(self, step):
-    with tf.name_scope(self.name or 'WarmUp') as name:
-      # Implements polynomial warmup. i.e., if global_step < warmup_steps, the
-      # learning rate will be `global_step/num_warmup_steps * init_lr`.
-      global_step_float = tf.cast(step, tf.float32)
-      warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
-      warmup_percent_done = global_step_float / warmup_steps_float
-      warmup_learning_rate = (
-          self.initial_learning_rate *
-          tf.math.pow(warmup_percent_done, self.power))
-      return tf.cond(global_step_float < warmup_steps_float,
-                     lambda: warmup_learning_rate,
-                     lambda: self.decay_schedule_fn(step),
-                     name=name)
+    def __call__(self, step):
+        with tf.name_scope(self.name or "WarmUp") as name:
+            # Implements polynomial warmup. i.e., if global_step < warmup_steps, the
+            # learning rate will be `global_step/num_warmup_steps * init_lr`.
+            global_step_float = tf.cast(step, tf.float32)
+            warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
+            warmup_percent_done = global_step_float / warmup_steps_float
+            warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power)
+            return tf.cond(
+                global_step_float < warmup_steps_float,
+                lambda: warmup_learning_rate,
+                lambda: self.decay_schedule_fn(step),
+                name=name,
+            )
 
-  def get_config(self):
-    return {
-        'initial_learning_rate': self.initial_learning_rate,
-        'decay_schedule_fn': self.decay_schedule_fn,
-        'warmup_steps': self.warmup_steps,
-        'power': self.power,
-        'name': self.name
-    }
+    def get_config(self):
+        return {
+            "initial_learning_rate": self.initial_learning_rate,
+            "decay_schedule_fn": self.decay_schedule_fn,
+            "warmup_steps": self.warmup_steps,
+            "power": self.power,
+            "name": self.name,
+        }
 
 
 def create_optimizer(init_lr, num_train_steps, num_warmup_steps):
-  """Creates an optimizer with learning rate schedule."""
-  # Implements linear decay of the learning rate.
-  learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
-      initial_learning_rate=init_lr,
-      decay_steps=num_train_steps,
-      end_learning_rate=0.0)
-  if num_warmup_steps:
-    learning_rate_fn = WarmUp(initial_learning_rate=init_lr,
-                              decay_schedule_fn=learning_rate_fn,
-                              warmup_steps=num_warmup_steps)
-  optimizer = AdamWeightDecay(
-      learning_rate=learning_rate_fn,
-      weight_decay_rate=0.01,
-      beta_1=0.9,
-      beta_2=0.999,
-      epsilon=1e-6,
-      exclude_from_weight_decay=['layer_norm', 'bias'])
-  return optimizer
+    """Creates an optimizer with learning rate schedule."""
+    # Implements linear decay of the learning rate.
+    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
+        initial_learning_rate=init_lr, decay_steps=num_train_steps, end_learning_rate=0.0
+    )
+    if num_warmup_steps:
+        learning_rate_fn = WarmUp(
+            initial_learning_rate=init_lr, decay_schedule_fn=learning_rate_fn, warmup_steps=num_warmup_steps
+        )
+    optimizer = AdamWeightDecay(
+        learning_rate=learning_rate_fn,
+        weight_decay_rate=0.01,
+        beta_1=0.9,
+        beta_2=0.999,
+        epsilon=1e-6,
+        exclude_from_weight_decay=["layer_norm", "bias"],
+    )
+    return optimizer
 
 
 class AdamWeightDecay(tf.keras.optimizers.Adam):
-  """Adam enables L2 weight decay and clip_by_global_norm on gradients.
+    """Adam enables L2 weight decay and clip_by_global_norm on gradients.
 
   Just adding the square of the weights to the loss function is *not* the
   correct way of using L2 regularization/weight decay with Adam, since that will
@@ -98,99 +92,94 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
   the loss with plain (non-momentum) SGD.
   """
 
-  def __init__(self,
-               learning_rate=0.001,
-               beta_1=0.9,
-               beta_2=0.999,
-               epsilon=1e-7,
-               amsgrad=False,
-               weight_decay_rate=0.0,
-               include_in_weight_decay=None,
-               exclude_from_weight_decay=None,
-               name='AdamWeightDecay',
-               **kwargs):
-    super(AdamWeightDecay, self).__init__(
-        learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
-    self.weight_decay_rate = weight_decay_rate
-    self._include_in_weight_decay = include_in_weight_decay
-    self._exclude_from_weight_decay = exclude_from_weight_decay
+    def __init__(
+        self,
+        learning_rate=0.001,
+        beta_1=0.9,
+        beta_2=0.999,
+        epsilon=1e-7,
+        amsgrad=False,
+        weight_decay_rate=0.0,
+        include_in_weight_decay=None,
+        exclude_from_weight_decay=None,
+        name="AdamWeightDecay",
+        **kwargs
+    ):
+        super(AdamWeightDecay, self).__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
+        self.weight_decay_rate = weight_decay_rate
+        self._include_in_weight_decay = include_in_weight_decay
+        self._exclude_from_weight_decay = exclude_from_weight_decay
 
-  @classmethod
-  def from_config(cls, config):
-    """Creates an optimizer from its config with WarmUp custom object."""
-    custom_objects = {'WarmUp': WarmUp}
-    return super(AdamWeightDecay, cls).from_config(
-        config, custom_objects=custom_objects)
+    @classmethod
+    def from_config(cls, config):
+        """Creates an optimizer from its config with WarmUp custom object."""
+        custom_objects = {"WarmUp": WarmUp}
+        return super(AdamWeightDecay, cls).from_config(config, custom_objects=custom_objects)
 
-  def _prepare_local(self, var_device, var_dtype, apply_state):
-    super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype,
-                                                apply_state)
-    apply_state['weight_decay_rate'] = tf.constant(
-        self.weight_decay_rate, name='adam_weight_decay_rate')
+    def _prepare_local(self, var_device, var_dtype, apply_state):
+        super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, apply_state)
+        apply_state["weight_decay_rate"] = tf.constant(self.weight_decay_rate, name="adam_weight_decay_rate")
 
-  def _decay_weights_op(self, var, learning_rate, apply_state):
-    do_decay = self._do_use_weight_decay(var.name)
-    if do_decay:
-      return var.assign_sub(
-          learning_rate * var *
-          apply_state['weight_decay_rate'],
-          use_locking=self._use_locking)
-    return tf.no_op()
+    def _decay_weights_op(self, var, learning_rate, apply_state):
+        do_decay = self._do_use_weight_decay(var.name)
+        if do_decay:
+            return var.assign_sub(
+                learning_rate * var * apply_state["weight_decay_rate"], use_locking=self._use_locking
+            )
+        return tf.no_op()
 
-  def apply_gradients(self, grads_and_vars, clip_norm, name=None):
-    grads, tvars = list(zip(*grads_and_vars))
-    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
-    return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars))
+    def apply_gradients(self, grads_and_vars, clip_norm, name=None):
+        grads, tvars = list(zip(*grads_and_vars))
+        (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
+        return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars))
 
-  def _get_lr(self, var_device, var_dtype, apply_state):
-    """Retrieves the learning rate with the given state."""
-    if apply_state is None:
-      return self._decayed_lr_t[var_dtype], {}
+    def _get_lr(self, var_device, var_dtype, apply_state):
+        """Retrieves the learning rate with the given state."""
+        if apply_state is None:
+            return self._decayed_lr_t[var_dtype], {}
 
-    apply_state = apply_state or {}
-    coefficients = apply_state.get((var_device, var_dtype))
-    if coefficients is None:
-      coefficients = self._fallback_apply_state(var_device, var_dtype)
-      apply_state[(var_device, var_dtype)] = coefficients
+        apply_state = apply_state or {}
+        coefficients = apply_state.get((var_device, var_dtype))
+        if coefficients is None:
+            coefficients = self._fallback_apply_state(var_device, var_dtype)
+            apply_state[(var_device, var_dtype)] = coefficients
 
-    return coefficients['lr_t'], dict(apply_state=apply_state)
+        return coefficients["lr_t"], dict(apply_state=apply_state)
 
-  def _resource_apply_dense(self, grad, var, apply_state=None):
-    lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
-    decay = self._decay_weights_op(var, lr_t, apply_state)
-    with tf.control_dependencies([decay]):
-      return super(AdamWeightDecay, self)._resource_apply_dense(
-          grad, var, **kwargs)
+    def _resource_apply_dense(self, grad, var, apply_state=None):
+        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
+        decay = self._decay_weights_op(var, lr_t, apply_state)
+        with tf.control_dependencies([decay]):
+            return super(AdamWeightDecay, self)._resource_apply_dense(grad, var, **kwargs)
 
-  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
-    lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
-    decay = self._decay_weights_op(var, lr_t, apply_state)
-    with tf.control_dependencies([decay]):
-      return super(AdamWeightDecay, self)._resource_apply_sparse(
-          grad, var, indices, **kwargs)
+    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
+        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
+        decay = self._decay_weights_op(var, lr_t, apply_state)
+        with tf.control_dependencies([decay]):
+            return super(AdamWeightDecay, self)._resource_apply_sparse(grad, var, indices, **kwargs)
 
-  def get_config(self):
-    config = super(AdamWeightDecay, self).get_config()
-    config.update({
-        'weight_decay_rate': self.weight_decay_rate,
-    })
-    return config
+    def get_config(self):
+        config = super(AdamWeightDecay, self).get_config()
+        config.update(
+            {"weight_decay_rate": self.weight_decay_rate,}
+        )
+        return config
 
-  def _do_use_weight_decay(self, param_name):
-    """Whether to use L2 weight decay for `param_name`."""
-    if self.weight_decay_rate == 0:
-      return False
+    def _do_use_weight_decay(self, param_name):
+        """Whether to use L2 weight decay for `param_name`."""
+        if self.weight_decay_rate == 0:
+            return False
 
-    if self._include_in_weight_decay:
-      for r in self._include_in_weight_decay:
-        if re.search(r, param_name) is not None:
-          return True
+        if self._include_in_weight_decay:
+            for r in self._include_in_weight_decay:
+                if re.search(r, param_name) is not None:
+                    return True
 
-    if self._exclude_from_weight_decay:
-      for r in self._exclude_from_weight_decay:
-        if re.search(r, param_name) is not None:
-          return False
-    return True
+        if self._exclude_from_weight_decay:
+            for r in self._exclude_from_weight_decay:
+                if re.search(r, param_name) is not None:
+                    return False
+        return True
 
 
 ## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
@@ -201,10 +190,8 @@ class GradientAccumulator(object):
         """Initializes the accumulator."""
         self._gradients = []
         self._accum_steps = tf.Variable(
-            initial_value=0,
-            dtype=tf.int64,
-            trainable=False,
-            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
+            initial_value=0, dtype=tf.int64, trainable=False, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA
+        )
 
     @property
     def step(self):
@@ -214,12 +201,19 @@ class GradientAccumulator(object):
     @property
     def gradients(self):
         """The accumulated gradients."""
-        return list(gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients())
+        return list(
+            gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients()
+        )
 
     def __call__(self, gradients):
         """Accumulates :obj:`gradients`."""
         if not self._gradients:
-            self._gradients.extend([tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient for gradient in gradients])
+            self._gradients.extend(
+                [
+                    tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient
+                    for gradient in gradients
+                ]
+            )
 
         if len(gradients) != len(self._gradients):
             raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients)))
@@ -249,6 +243,9 @@ class GradientAccumulator(object):
             if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1:
                 return self._gradients
 
-            return (gradient.device_map.select_for_current_replica(gradient.values, replica_context) for gradient in self._gradients)
+            return (
+                gradient.device_map.select_for_current_replica(gradient.values, replica_context)
+                for gradient in self._gradients
+            )
         else:
             return self._gradients
diff --git a/transformers/pipelines.py b/transformers/pipelines.py
index f4bf3da685..4149c2e475 100755
--- a/transformers/pipelines.py
+++ b/transformers/pipelines.py
@@ -30,25 +30,42 @@ from typing import Union, Optional, Tuple, List, Dict
 
 import numpy as np
 
-from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer,
-                          PretrainedConfig, ModelCard, SquadExample,
-                          squad_convert_examples_to_features, is_tf_available,
-                          is_torch_available, BasicTokenizer,
-                          ALL_PRETRAINED_CONFIG_ARCHIVE_MAP)
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    PreTrainedTokenizer,
+    PretrainedConfig,
+    ModelCard,
+    SquadExample,
+    squad_convert_examples_to_features,
+    is_tf_available,
+    is_torch_available,
+    BasicTokenizer,
+    ALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+)
 
 if is_tf_available():
     import tensorflow as tf
-    from transformers import TFAutoModel, TFAutoModelForSequenceClassification, \
-        TFAutoModelForQuestionAnswering, TFAutoModelForTokenClassification
+    from transformers import (
+        TFAutoModel,
+        TFAutoModelForSequenceClassification,
+        TFAutoModelForQuestionAnswering,
+        TFAutoModelForTokenClassification,
+    )
 
 if is_torch_available():
     import torch
-    from transformers import AutoModel, AutoModelForSequenceClassification, \
-        AutoModelForQuestionAnswering, AutoModelForTokenClassification
+    from transformers import (
+        AutoModel,
+        AutoModelForSequenceClassification,
+        AutoModelForQuestionAnswering,
+        AutoModelForTokenClassification,
+    )
 
 
 logger = logging.getLogger(__name__)
 
+
 def get_framework(model=None):
     """ Select framework (TensorFlow/PyTorch) to use.
         If both frameworks are installed and no specific model is provided, defaults to using PyTorch.
@@ -56,20 +73,24 @@ def get_framework(model=None):
     if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str):
         # Both framework are available but the use supplied a model class instance.
         # Try to guess which framework to use from the model classname
-        framework = 'tf' if model.__class__.__name__.startswith('TF') else 'pt'
+        framework = "tf" if model.__class__.__name__.startswith("TF") else "pt"
     elif not is_tf_available() and not is_torch_available():
-        raise ImportError("At least one of TensorFlow 2.0 or PyTorch should be installed. "
-                          "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
-                          "To install PyTorch, read the instructions at https://pytorch.org/.")
+        raise ImportError(
+            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
+            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
+            "To install PyTorch, read the instructions at https://pytorch.org/."
+        )
     else:
         # framework = 'tf' if is_tf_available() else 'pt'
-        framework = 'pt' if is_torch_available() else 'tf'
+        framework = "pt" if is_torch_available() else "tf"
     return framework
 
+
 class ArgumentHandler(ABC):
     """
     Base interface for handling varargs for each Pipeline
     """
+
     @abstractmethod
     def __call__(self, *args, **kwargs):
         raise NotImplementedError()
@@ -79,11 +100,12 @@ class DefaultArgumentHandler(ArgumentHandler):
     """
     Default varargs argument parser handling parameters for each Pipeline
     """
+
     def __call__(self, *args, **kwargs):
-        if 'X' in kwargs:
-            return kwargs['X']
-        elif 'data' in kwargs:
-            return kwargs['data']
+        if "X" in kwargs:
+            return kwargs["X"]
+        elif "data" in kwargs:
+            return kwargs["data"]
         elif len(args) == 1:
             if isinstance(args[0], list):
                 return args[0]
@@ -91,7 +113,7 @@ class DefaultArgumentHandler(ArgumentHandler):
                 return [args[0]]
         elif len(args) > 1:
             return list(args)
-        raise ValueError('Unable to infer the format of the provided data (X=, data=, ...)')
+        raise ValueError("Unable to infer the format of the provided data (X=, data=, ...)")
 
 
 class PipelineDataFormat:
@@ -105,24 +127,25 @@ class PipelineDataFormat:
     PipelineDataFormat also includes some utilities to work with multi-columns like mapping from datasets columns
     to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
     """
-    SUPPORTED_FORMATS = ['json', 'csv', 'pipe']
+
+    SUPPORTED_FORMATS = ["json", "csv", "pipe"]
 
     def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
         self.output_path = output_path
         self.input_path = input_path
-        self.column = column.split(',') if column is not None else ['']
+        self.column = column.split(",") if column is not None else [""]
         self.is_multi_columns = len(self.column) > 1
 
         if self.is_multi_columns:
-            self.column = [tuple(c.split('=')) if '=' in c else (c, c) for c in self.column]
+            self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column]
 
         if output_path is not None and not overwrite:
             if exists(abspath(self.output_path)):
-                raise OSError('{} already exists on disk'.format(self.output_path))
+                raise OSError("{} already exists on disk".format(self.output_path))
 
         if input_path is not None:
             if not exists(abspath(self.input_path)):
-                raise OSError('{} doesnt exist on disk'.format(self.input_path))
+                raise OSError("{} doesnt exist on disk".format(self.input_path))
 
     @abstractmethod
     def __iter__(self):
@@ -144,23 +167,25 @@ class PipelineDataFormat:
         :return: (str) Path where the data has been saved
         """
         path, _ = os.path.splitext(self.output_path)
-        binary_path = os.path.extsep.join((path, 'pickle'))
+        binary_path = os.path.extsep.join((path, "pickle"))
 
-        with open(binary_path, 'wb+') as f_output:
+        with open(binary_path, "wb+") as f_output:
             pickle.dump(data, f_output)
 
         return binary_path
 
     @staticmethod
-    def from_str(format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
-        if format == 'json':
+    def from_str(
+        format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False
+    ):
+        if format == "json":
             return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
-        elif format == 'csv':
+        elif format == "csv":
             return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
-        elif format == 'pipe':
+        elif format == "pipe":
             return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
         else:
-            raise KeyError('Unknown reader {} (Available reader are json/csv/pipe)'.format(format))
+            raise KeyError("Unknown reader {} (Available reader are json/csv/pipe)".format(format))
 
 
 class CsvPipelineDataFormat(PipelineDataFormat):
@@ -168,7 +193,7 @@ class CsvPipelineDataFormat(PipelineDataFormat):
         super().__init__(output_path, input_path, column, overwrite=overwrite)
 
     def __iter__(self):
-        with open(self.input_path, 'r') as f:
+        with open(self.input_path, "r") as f:
             reader = csv.DictReader(f)
             for row in reader:
                 if self.is_multi_columns:
@@ -177,7 +202,7 @@ class CsvPipelineDataFormat(PipelineDataFormat):
                     yield row[self.column[0]]
 
     def save(self, data: List[dict]):
-        with open(self.output_path, 'w') as f:
+        with open(self.output_path, "w") as f:
             if len(data) > 0:
                 writer = csv.DictWriter(f, list(data[0].keys()))
                 writer.writeheader()
@@ -188,7 +213,7 @@ class JsonPipelineDataFormat(PipelineDataFormat):
     def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
         super().__init__(output_path, input_path, column, overwrite=overwrite)
 
-        with open(input_path, 'r') as f:
+        with open(input_path, "r") as f:
             self._entries = json.load(f)
 
     def __iter__(self):
@@ -199,7 +224,7 @@ class JsonPipelineDataFormat(PipelineDataFormat):
                 yield entry[self.column[0]]
 
     def save(self, data: dict):
-        with open(self.output_path, 'w') as f:
+        with open(self.output_path, "w") as f:
             json.dump(data, f)
 
 
@@ -210,12 +235,13 @@ class PipedPipelineDataFormat(PipelineDataFormat):
 
     If columns are provided, then the output will be a dictionary with {column_x: value_x}
     """
+
     def __iter__(self):
         for line in sys.stdin:
             # Split for multi-columns
-            if '\t' in line:
+            if "\t" in line:
 
-                line = line.split('\t')
+                line = line.split("\t")
                 if self.column:
                     # Dictionary to map arguments
                     yield {kwargs: l for (kwargs, _), l in zip(self.column, line)}
@@ -232,8 +258,8 @@ class PipedPipelineDataFormat(PipelineDataFormat):
     def save_binary(self, data: Union[dict, List[dict]]) -> str:
         if self.output_path is None:
             raise KeyError(
-                'When using piped input on pipeline outputting large object requires an output file path. '
-                'Please provide such output path through --output argument.'
+                "When using piped input on pipeline outputting large object requires an output file path. "
+                "Please provide such output path through --output argument."
             )
 
         return super().save_binary(data)
@@ -298,10 +324,16 @@ class Pipeline(_ScikitCompat):
 
     default_input_names = None
 
-    def __init__(self, model, tokenizer: PreTrainedTokenizer = None,
-                 modelcard: ModelCard = None, framework: Optional[str] = None,
-                 args_parser: ArgumentHandler = None, device: int = -1,
-                 binary_output: bool = False):
+    def __init__(
+        self,
+        model,
+        tokenizer: PreTrainedTokenizer = None,
+        modelcard: ModelCard = None,
+        framework: Optional[str] = None,
+        args_parser: ArgumentHandler = None,
+        device: int = -1,
+        binary_output: bool = False,
+    ):
 
         if framework is None:
             framework = get_framework()
@@ -315,8 +347,8 @@ class Pipeline(_ScikitCompat):
         self._args_parser = args_parser or DefaultArgumentHandler()
 
         # Special handling
-        if self.device >= 0 and self.framework == 'pt':
-            self.model = self.model.to('cuda:{}'.format(self.device))
+        if self.device >= 0 and self.framework == "pt":
+            self.model = self.model.to("cuda:{}".format(self.device))
 
     def save_pretrained(self, save_directory):
         """
@@ -356,8 +388,8 @@ class Pipeline(_ScikitCompat):
         Returns:
             Context manager
         """
-        if self.framework == 'tf':
-            with tf.device('/CPU:0' if self.device == -1 else '/device:GPU:{}'.format(self.device)):
+        if self.framework == "tf":
+            with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)):
                 yield
         else:
             if self.device >= 0:
@@ -372,11 +404,11 @@ class Pipeline(_ScikitCompat):
         Returns:
             dict holding all the required parameters for model's forward
         """
-        args = ['input_ids', 'attention_mask']
+        args = ["input_ids", "attention_mask"]
         model_type = type(self.model).__name__.lower()
 
-        if 'distilbert' not in model_type and 'xlm' not in model_type:
-            args += ['token_type_ids']
+        if "distilbert" not in model_type and "xlm" not in model_type:
+            args += ["token_type_ids"]
 
         # PR #1548 (CLI) There is an issue with attention_mask
         # if 'xlnet' in model_type or 'xlm' in model_type:
@@ -394,9 +426,7 @@ class Pipeline(_ScikitCompat):
         # Encode for forward
         with self.device_placement():
             inputs = self.tokenizer.batch_encode_plus(
-                inputs, add_special_tokens=True,
-                return_tensors=self.framework,
-                max_length=self.tokenizer.max_len
+                inputs, add_special_tokens=True, return_tensors=self.framework, max_length=self.tokenizer.max_len
             )
 
             # Filter out features not available on specific models
@@ -411,7 +441,7 @@ class Pipeline(_ScikitCompat):
         Returns:
             Numpy array
         """
-        if self.framework == 'tf':
+        if self.framework == "tf":
             # TODO trace model
             predictions = self.model(inputs, training=False)[0]
         else:
@@ -426,19 +456,24 @@ class FeatureExtractionPipeline(Pipeline):
     Feature extraction pipeline using Model head.
     """
 
-    def __init__(self, model,
-                 tokenizer: PreTrainedTokenizer = None,
-                 modelcard: ModelCard = None,
-                 framework: Optional[str] = None,
-                 args_parser: ArgumentHandler = None,
-                 device: int = -1):
-        super().__init__(model=model,
-                         tokenizer=tokenizer,
-                         modelcard=modelcard,
-                         framework=framework,
-                         args_parser=args_parser,
-                         device=device,
-                         binary_output=True)
+    def __init__(
+        self,
+        model,
+        tokenizer: PreTrainedTokenizer = None,
+        modelcard: ModelCard = None,
+        framework: Optional[str] = None,
+        args_parser: ArgumentHandler = None,
+        device: int = -1,
+    ):
+        super().__init__(
+            model=model,
+            tokenizer=tokenizer,
+            modelcard=modelcard,
+            framework=framework,
+            args_parser=args_parser,
+            device=device,
+            binary_output=True,
+        )
 
     def __call__(self, *args, **kwargs):
         return super().__call__(*args, **kwargs).tolist()
@@ -452,7 +487,7 @@ class TextClassificationPipeline(Pipeline):
     def __call__(self, *args, **kwargs):
         outputs = super().__call__(*args, **kwargs)
         scores = np.exp(outputs) / np.exp(outputs).sum(-1)
-        return [{'label': self.model.config.id2label[item.argmax()], 'score': item.max()} for item in scores]
+        return [{"label": self.model.config.id2label[item.argmax()], "score": item.max()} for item in scores]
 
 
 class NerPipeline(Pipeline):
@@ -460,19 +495,28 @@ class NerPipeline(Pipeline):
     Named Entity Recognition pipeline using ModelForTokenClassification head.
     """
 
-    default_input_names = 'sequences'
+    default_input_names = "sequences"
 
-    def __init__(self, model, tokenizer: PreTrainedTokenizer = None,
-                 modelcard: ModelCard = None, framework: Optional[str] = None,
-                 args_parser: ArgumentHandler = None, device: int = -1,
-                 binary_output: bool = False, ignore_labels=['O']):
-        super().__init__(model=model,
-                         tokenizer=tokenizer,
-                         modelcard=modelcard,
-                         framework=framework,
-                         args_parser=args_parser,
-                         device=device,
-                         binary_output=binary_output)
+    def __init__(
+        self,
+        model,
+        tokenizer: PreTrainedTokenizer = None,
+        modelcard: ModelCard = None,
+        framework: Optional[str] = None,
+        args_parser: ArgumentHandler = None,
+        device: int = -1,
+        binary_output: bool = False,
+        ignore_labels=["O"],
+    ):
+        super().__init__(
+            model=model,
+            tokenizer=tokenizer,
+            modelcard=modelcard,
+            framework=framework,
+            args_parser=args_parser,
+            device=device,
+            binary_output=binary_output,
+        )
 
         self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
         self.ignore_labels = ignore_labels
@@ -485,19 +529,20 @@ class NerPipeline(Pipeline):
             with self.device_placement():
 
                 tokens = self.tokenizer.encode_plus(
-                    sentence, return_attention_mask=False,
+                    sentence,
+                    return_attention_mask=False,
                     return_tensors=self.framework,
-                    max_length=self.tokenizer.max_len
+                    max_length=self.tokenizer.max_len,
                 )
 
                 # Forward
-                if self.framework == 'tf':
+                if self.framework == "tf":
                     entities = self.model(tokens)[0][0].numpy()
-                    input_ids = tokens['input_ids'].numpy()[0]
+                    input_ids = tokens["input_ids"].numpy()[0]
                 else:
                     with torch.no_grad():
                         entities = self.model(**tokens)[0][0].cpu().numpy()
-                        input_ids = tokens['input_ids'].cpu().numpy()[0]
+                        input_ids = tokens["input_ids"].cpu().numpy()[0]
 
             score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
             labels_idx = score.argmax(axis=-1)
@@ -505,11 +550,13 @@ class NerPipeline(Pipeline):
             answer = []
             for idx, label_idx in enumerate(labels_idx):
                 if self.model.config.id2label[label_idx] not in self.ignore_labels:
-                    answer += [{
-                        'word': self.tokenizer.decode([int(input_ids[idx])]),
-                        'score': score[idx][label_idx].item(),
-                        'entity': self.model.config.id2label[label_idx]
-                    }]
+                    answer += [
+                        {
+                            "word": self.tokenizer.decode([int(input_ids[idx])]),
+                            "score": score[idx][label_idx].item(),
+                            "entity": self.model.config.id2label[label_idx],
+                        }
+                    ]
 
             # Append
             answers += [answer]
@@ -526,18 +573,19 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
     QuestionAnsweringArgumentHandler manages all the possible to create SquadExample from the command-line supplied
     arguments.
     """
+
     def __call__(self, *args, **kwargs):
         # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating
         if args is not None and len(args) > 0:
             if len(args) == 1:
-                kwargs['X'] = args[0]
+                kwargs["X"] = args[0]
             else:
-                kwargs['X'] = list(args)
+                kwargs["X"] = list(args)
 
         # Generic compatibility with sklearn and Keras
         # Batched data
-        if 'X' in kwargs or 'data' in kwargs:
-            inputs = kwargs['X'] if 'X' in kwargs else kwargs['data']
+        if "X" in kwargs or "data" in kwargs:
+            inputs = kwargs["X"] if "X" in kwargs else kwargs["data"]
 
             if isinstance(inputs, dict):
                 inputs = [inputs]
@@ -547,28 +595,31 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
 
             for i, item in enumerate(inputs):
                 if isinstance(item, dict):
-                    if any(k not in item for k in ['question', 'context']):
-                        raise KeyError('You need to provide a dictionary with keys {question:..., context:...}')
+                    if any(k not in item for k in ["question", "context"]):
+                        raise KeyError("You need to provide a dictionary with keys {question:..., context:...}")
 
                     inputs[i] = QuestionAnsweringPipeline.create_sample(**item)
 
                 elif not isinstance(item, SquadExample):
                     raise ValueError(
-                        '{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)'
-                            .format('X' if 'X' in kwargs else 'data')
+                        "{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)".format(
+                            "X" if "X" in kwargs else "data"
+                        )
                     )
 
             # Tabular input
-        elif 'question' in kwargs and 'context' in kwargs:
-            if isinstance(kwargs['question'], str):
-                kwargs['question'] = [kwargs['question']]
+        elif "question" in kwargs and "context" in kwargs:
+            if isinstance(kwargs["question"], str):
+                kwargs["question"] = [kwargs["question"]]
 
-            if isinstance(kwargs['context'], str):
-                kwargs['context'] = [kwargs['context']]
+            if isinstance(kwargs["context"], str):
+                kwargs["context"] = [kwargs["context"]]
 
-            inputs = [QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs['question'], kwargs['context'])]
+            inputs = [
+                QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs["question"], kwargs["context"])
+            ]
         else:
-            raise ValueError('Unknown arguments {}'.format(kwargs))
+            raise ValueError("Unknown arguments {}".format(kwargs))
 
         if not isinstance(inputs, list):
             inputs = [inputs]
@@ -581,22 +632,31 @@ class QuestionAnsweringPipeline(Pipeline):
     Question Answering pipeline using ModelForQuestionAnswering head.
     """
 
-    default_input_names = 'question,context'
+    default_input_names = "question,context"
 
-    def __init__(self, model,
-                 tokenizer: Optional[PreTrainedTokenizer],
-                 modelcard: Optional[ModelCard],
-                 framework: Optional[str] = None,
-                 device: int = -1, **kwargs):
-        super().__init__(model=model,
-                         tokenizer=tokenizer,
-                         modelcard=modelcard,
-                         framework=framework,
-                         args_parser=QuestionAnsweringArgumentHandler(),
-                         device=device, **kwargs)
+    def __init__(
+        self,
+        model,
+        tokenizer: Optional[PreTrainedTokenizer],
+        modelcard: Optional[ModelCard],
+        framework: Optional[str] = None,
+        device: int = -1,
+        **kwargs
+    ):
+        super().__init__(
+            model=model,
+            tokenizer=tokenizer,
+            modelcard=modelcard,
+            framework=framework,
+            args_parser=QuestionAnsweringArgumentHandler(),
+            device=device,
+            **kwargs
+        )
 
     @staticmethod
-    def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]:
+    def create_sample(
+        question: Union[str, List[str]], context: Union[str, List[str]]
+    ) -> Union[SquadExample, List[SquadExample]]:
         """
         QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally.
         This helper method encapsulate all the logic for converting question(s) and context(s) to SquadExample(s).
@@ -629,26 +689,28 @@ class QuestionAnsweringPipeline(Pipeline):
             end: the character index in the original string corresponding to the ending of the answer' span
         """
         # Set defaults values
-        kwargs.setdefault('topk', 1)
-        kwargs.setdefault('doc_stride', 128)
-        kwargs.setdefault('max_answer_len', 15)
-        kwargs.setdefault('max_seq_len', 384)
-        kwargs.setdefault('max_question_len', 64)
+        kwargs.setdefault("topk", 1)
+        kwargs.setdefault("doc_stride", 128)
+        kwargs.setdefault("max_answer_len", 15)
+        kwargs.setdefault("max_seq_len", 384)
+        kwargs.setdefault("max_question_len", 64)
 
-        if kwargs['topk'] < 1:
-            raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk']))
+        if kwargs["topk"] < 1:
+            raise ValueError("topk parameter should be >= 1 (got {})".format(kwargs["topk"]))
 
-        if kwargs['max_answer_len'] < 1:
-            raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len']))
+        if kwargs["max_answer_len"] < 1:
+            raise ValueError("max_answer_len parameter should be >= 1 (got {})".format(kwargs["max_answer_len"]))
 
         # Convert inputs to features
         examples = self._args_parser(*texts, **kwargs)
-        features = squad_convert_examples_to_features(examples, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False)
+        features = squad_convert_examples_to_features(
+            examples, self.tokenizer, kwargs["max_seq_len"], kwargs["doc_stride"], kwargs["max_question_len"], False
+        )
         fw_args = self.inputs_for_model([f.__dict__ for f in features])
 
         # Manage tensor allocation on correct device
         with self.device_placement():
-            if self.framework == 'tf':
+            if self.framework == "tf":
                 fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
                 start, end = self.model(fw_args)
                 start, end = start.numpy(), end.numpy()
@@ -672,16 +734,18 @@ class QuestionAnsweringPipeline(Pipeline):
             # Mask CLS
             start_[0] = end_[0] = 0
 
-            starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len'])
+            starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
             char_to_word = np.array(example.char_to_word_offset)
 
             # Convert the answer (tokens) back to the original text
             answers += [
                 {
-                    'score': score.item(),
-                    'start': np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
-                    'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
-                    'answer': ' '.join(example.doc_tokens[feature.token_to_orig_map[s]:feature.token_to_orig_map[e] + 1])
+                    "score": score.item(),
+                    "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
+                    "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
+                    "answer": " ".join(
+                        example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
+                    ),
                 }
                 for s, e, score in zip(starts, ends, scores)
             ]
@@ -767,71 +831,71 @@ class QuestionAnsweringPipeline(Pipeline):
             chars_idx += len(word) + 1
 
         # Join text with spaces
-        return {'answer': ' '.join(words), 'start': max(0, char_start_idx), 'end': min(len(text), char_end_idx)}
+        return {"answer": " ".join(words), "start": max(0, char_start_idx), "end": min(len(text), char_end_idx)}
 
 
 # Register all the supported task here
 SUPPORTED_TASKS = {
-    'feature-extraction': {
-        'impl': FeatureExtractionPipeline,
-        'tf': TFAutoModel if is_tf_available() else None,
-        'pt': AutoModel if is_torch_available() else None,
-        'default': {
-            'model': {
-                'pt': 'distilbert-base-uncased',
-                'tf': 'distilbert-base-uncased',
-            },
-            'config': None,
-            'tokenizer': 'distilbert-base-uncased'
-        }
+    "feature-extraction": {
+        "impl": FeatureExtractionPipeline,
+        "tf": TFAutoModel if is_tf_available() else None,
+        "pt": AutoModel if is_torch_available() else None,
+        "default": {
+            "model": {"pt": "distilbert-base-uncased", "tf": "distilbert-base-uncased",},
+            "config": None,
+            "tokenizer": "distilbert-base-uncased",
+        },
     },
-    'sentiment-analysis': {
-        'impl': TextClassificationPipeline,
-        'tf': TFAutoModelForSequenceClassification if is_tf_available() else None,
-        'pt': AutoModelForSequenceClassification if is_torch_available() else None,
-        'default': {
-            'model': {
-                'pt': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin',
-                'tf': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5',
+    "sentiment-analysis": {
+        "impl": TextClassificationPipeline,
+        "tf": TFAutoModelForSequenceClassification if is_tf_available() else None,
+        "pt": AutoModelForSequenceClassification if is_torch_available() else None,
+        "default": {
+            "model": {
+                "pt": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin",
+                "tf": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5",
             },
-            'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json',
-            'tokenizer': 'distilbert-base-uncased'
-        }
+            "config": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json",
+            "tokenizer": "distilbert-base-uncased",
+        },
     },
-    'ner': {
-        'impl': NerPipeline,
-        'tf': TFAutoModelForTokenClassification if is_tf_available() else None,
-        'pt': AutoModelForTokenClassification if is_torch_available() else None,
-        'default': {
-            'model': {
-                'pt':'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin',
-                'tf': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5',
+    "ner": {
+        "impl": NerPipeline,
+        "tf": TFAutoModelForTokenClassification if is_tf_available() else None,
+        "pt": AutoModelForTokenClassification if is_torch_available() else None,
+        "default": {
+            "model": {
+                "pt": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin",
+                "tf": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5",
             },
-            'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json',
-            'tokenizer': 'bert-large-cased'
-        }
+            "config": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json",
+            "tokenizer": "bert-large-cased",
+        },
     },
-    'question-answering': {
-        'impl': QuestionAnsweringPipeline,
-        'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
-        'pt': AutoModelForQuestionAnswering if is_torch_available() else None,
-        'default': {
-            'model': {
-                'pt': 'distilbert-base-uncased-distilled-squad',
-                'tf': 'distilbert-base-uncased-distilled-squad',
+    "question-answering": {
+        "impl": QuestionAnsweringPipeline,
+        "tf": TFAutoModelForQuestionAnswering if is_tf_available() else None,
+        "pt": AutoModelForQuestionAnswering if is_torch_available() else None,
+        "default": {
+            "model": {
+                "pt": "distilbert-base-uncased-distilled-squad",
+                "tf": "distilbert-base-uncased-distilled-squad",
             },
-            'config': None,
-            'tokenizer': 'distilbert-base-uncased'
-        }
-    }
+            "config": None,
+            "tokenizer": "distilbert-base-uncased",
+        },
+    },
 }
 
 
-def pipeline(task: str, model: Optional = None,
-             config: Optional[Union[str, PretrainedConfig]] = None,
-             tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
-             modelcard: Optional[Union[str, ModelCard]] = None,
-             **kwargs) -> Pipeline:
+def pipeline(
+    task: str,
+    model: Optional = None,
+    config: Optional[Union[str, PretrainedConfig]] = None,
+    tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
+    modelcard: Optional[Union[str, ModelCard]] = None,
+    **kwargs
+) -> Pipeline:
     """
     Utility factory method to build a pipeline.
     Pipeline are made of:
@@ -852,11 +916,11 @@ def pipeline(task: str, model: Optional = None,
     framework = get_framework(model)
 
     targeted_task = SUPPORTED_TASKS[task]
-    task, model_class = targeted_task['impl'], targeted_task[framework]
+    task, model_class = targeted_task["impl"], targeted_task[framework]
 
     # Use default model/config/tokenizer for the task if no model is provided
     if model is None:
-        models, config, tokenizer = tuple(targeted_task['default'].values())
+        models, config, tokenizer = tuple(targeted_task["default"].values())
         model = models[framework]
 
     # Try to infer tokenizer from model or config name (if provided as str)
@@ -867,8 +931,10 @@ def pipeline(task: str, model: Optional = None,
             tokenizer = config
         else:
             # Impossible to guest what is the right tokenizer here
-            raise Exception("Impossible to guess which tokenizer to use. "
-                            "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer.")
+            raise Exception(
+                "Impossible to guess which tokenizer to use. "
+                "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer."
+            )
 
     # Try to infer modelcard from model or config name (if provided as str)
     if modelcard is None:
@@ -894,14 +960,18 @@ def pipeline(task: str, model: Optional = None,
     if isinstance(model, str):
         # Handle transparent TF/PT model conversion
         model_kwargs = {}
-        if framework == 'pt' and model.endswith('.h5'):
-            model_kwargs['from_tf'] = True
-            logger.warning('Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. '
-                           'Trying to load the model with PyTorch.')
-        elif framework == 'tf' and model.endswith('.bin'):
-            model_kwargs['from_pt'] = True
-            logger.warning('Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. '
-                           'Trying to load the model with Tensorflow.')
+        if framework == "pt" and model.endswith(".h5"):
+            model_kwargs["from_tf"] = True
+            logger.warning(
+                "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
+                "Trying to load the model with PyTorch."
+            )
+        elif framework == "tf" and model.endswith(".bin"):
+            model_kwargs["from_pt"] = True
+            logger.warning(
+                "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
+                "Trying to load the model with Tensorflow."
+            )
         model = model_class.from_pretrained(model, config=config, **model_kwargs)
 
     return task(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, **kwargs)
diff --git a/transformers/tests/configuration_common_test.py b/transformers/tests/configuration_common_test.py
index 376d110d3c..d109a655f8 100644
--- a/transformers/tests/configuration_common_test.py
+++ b/transformers/tests/configuration_common_test.py
@@ -32,10 +32,10 @@ class ConfigTester(object):
 
     def create_and_test_config_common_properties(self):
         config = self.config_class(**self.inputs_dict)
-        self.parent.assertTrue(hasattr(config, 'vocab_size'))
-        self.parent.assertTrue(hasattr(config, 'hidden_size'))
-        self.parent.assertTrue(hasattr(config, 'num_attention_heads'))
-        self.parent.assertTrue(hasattr(config, 'num_hidden_layers'))
+        self.parent.assertTrue(hasattr(config, "vocab_size"))
+        self.parent.assertTrue(hasattr(config, "hidden_size"))
+        self.parent.assertTrue(hasattr(config, "num_attention_heads"))
+        self.parent.assertTrue(hasattr(config, "num_hidden_layers"))
 
     def create_and_test_config_to_json_string(self):
         config = self.config_class(**self.inputs_dict)
@@ -68,5 +68,6 @@ class ConfigTester(object):
         self.create_and_test_config_to_json_file()
         self.create_and_test_config_from_and_save_pretrained()
 
+
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/transformers/tests/hf_api_test.py b/transformers/tests/hf_api_test.py
index b45f5aceed..71963df107 100644
--- a/transformers/tests/hf_api_test.py
+++ b/transformers/tests/hf_api_test.py
@@ -28,20 +28,15 @@ PASS = "__DUMMY_TRANSFORMERS_PASS__"
 FILES = [
     (
         "Test-{}.txt".format(int(time.time())),
-        os.path.join(
-            os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
-        )
+        os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"),
     ),
     (
-        "yoyo {}.txt".format(int(time.time())), # space is intentional
-        os.path.join(
-            os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt"
-        )
+        "yoyo {}.txt".format(int(time.time())),  # space is intentional
+        os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt"),
     ),
 ]
 
 
-
 class HfApiCommonTest(unittest.TestCase):
     _api = HfApi(endpoint="https://moon-staging.huggingface.co")
 
@@ -76,11 +71,9 @@ class HfApiEndpointsTest(HfApiCommonTest):
 
     def test_presign_and_upload(self):
         for FILE_KEY, FILE_PATH in FILES:
-            access_url = self._api.presign_and_upload(
-                token=self._token, filename=FILE_KEY, filepath=FILE_PATH
-            )
+            access_url = self._api.presign_and_upload(token=self._token, filename=FILE_KEY, filepath=FILE_PATH)
             self.assertIsInstance(access_url, six.string_types)
-            with open(FILE_PATH, 'r') as f:
+            with open(FILE_PATH, "r") as f:
                 body = f.read()
             r = requests.get(access_url)
             self.assertEqual(r.text, body)
@@ -93,7 +86,6 @@ class HfApiEndpointsTest(HfApiCommonTest):
             self.assertIsInstance(o, S3Obj)
 
 
-
 class HfFolderTest(unittest.TestCase):
     def test_token_workflow(self):
         """
@@ -102,18 +94,12 @@ class HfFolderTest(unittest.TestCase):
         """
         token = "token-{}".format(int(time.time()))
         HfFolder.save_token(token)
-        self.assertEqual(
-            HfFolder.get_token(),
-            token
-        )
+        self.assertEqual(HfFolder.get_token(), token)
         HfFolder.delete_token()
         HfFolder.delete_token()
         # ^^ not an error, we test that the
         # second call does not fail.
-        self.assertEqual(
-            HfFolder.get_token(),
-            None
-        )
+        self.assertEqual(HfFolder.get_token(), None)
 
 
 if __name__ == "__main__":
diff --git a/transformers/tests/model_card_test.py b/transformers/tests/model_card_test.py
index b293b5726a..30fe33a904 100644
--- a/transformers/tests/model_card_test.py
+++ b/transformers/tests/model_card_test.py
@@ -21,44 +21,39 @@ import unittest
 from transformers.modelcard import ModelCard
 from .tokenization_tests_commons import TemporaryDirectory
 
-class ModelCardTester(unittest.TestCase):
 
+class ModelCardTester(unittest.TestCase):
     def setUp(self):
-        self.inputs_dict = {'model_details': {
-                                'Organization': 'testing',
-                                'Model date': 'today',
-                                'Model version': 'v2.1, Developed by Test Corp in 2019.',
-                                'Architecture': 'Convolutional Neural Network.',
-                                },
-                            'metrics': 'BLEU and ROUGE-1',
-                            'evaluation_data':{
-                                'Datasets':{
-                                    'BLEU': 'My-great-dataset-v1',
-                                    'ROUGE-1': 'My-short-dataset-v2.1',
-                                },
-                                'Preprocessing': 'See details on https://arxiv.org/pdf/1810.03993.pdf'
-                            },
-                            'training_data':{
-                                'Dataset': 'English Wikipedia dump dated 2018-12-01',
-                                'Preprocessing': 'Using SentencePiece vocabulary of size 52k tokens. See details on https://arxiv.org/pdf/1810.03993.pdf'
-                            },
-                            'quantitative_analyses': {
-                                'BLEU': 55.1,
-                                'ROUGE-1': 76,
-                            },
-                            }
+        self.inputs_dict = {
+            "model_details": {
+                "Organization": "testing",
+                "Model date": "today",
+                "Model version": "v2.1, Developed by Test Corp in 2019.",
+                "Architecture": "Convolutional Neural Network.",
+            },
+            "metrics": "BLEU and ROUGE-1",
+            "evaluation_data": {
+                "Datasets": {"BLEU": "My-great-dataset-v1", "ROUGE-1": "My-short-dataset-v2.1",},
+                "Preprocessing": "See details on https://arxiv.org/pdf/1810.03993.pdf",
+            },
+            "training_data": {
+                "Dataset": "English Wikipedia dump dated 2018-12-01",
+                "Preprocessing": "Using SentencePiece vocabulary of size 52k tokens. See details on https://arxiv.org/pdf/1810.03993.pdf",
+            },
+            "quantitative_analyses": {"BLEU": 55.1, "ROUGE-1": 76,},
+        }
 
     def test_model_card_common_properties(self):
         modelcard = ModelCard.from_dict(self.inputs_dict)
-        self.assertTrue(hasattr(modelcard, 'model_details'))
-        self.assertTrue(hasattr(modelcard, 'intended_use'))
-        self.assertTrue(hasattr(modelcard, 'factors'))
-        self.assertTrue(hasattr(modelcard, 'metrics'))
-        self.assertTrue(hasattr(modelcard, 'evaluation_data'))
-        self.assertTrue(hasattr(modelcard, 'training_data'))
-        self.assertTrue(hasattr(modelcard, 'quantitative_analyses'))
-        self.assertTrue(hasattr(modelcard, 'ethical_considerations'))
-        self.assertTrue(hasattr(modelcard, 'caveats_and_recommendations'))
+        self.assertTrue(hasattr(modelcard, "model_details"))
+        self.assertTrue(hasattr(modelcard, "intended_use"))
+        self.assertTrue(hasattr(modelcard, "factors"))
+        self.assertTrue(hasattr(modelcard, "metrics"))
+        self.assertTrue(hasattr(modelcard, "evaluation_data"))
+        self.assertTrue(hasattr(modelcard, "training_data"))
+        self.assertTrue(hasattr(modelcard, "quantitative_analyses"))
+        self.assertTrue(hasattr(modelcard, "ethical_considerations"))
+        self.assertTrue(hasattr(modelcard, "caveats_and_recommendations"))
 
     def test_model_card_to_json_string(self):
         modelcard = ModelCard.from_dict(self.inputs_dict)
@@ -70,7 +65,7 @@ class ModelCardTester(unittest.TestCase):
         model_card_first = ModelCard.from_dict(self.inputs_dict)
 
         with TemporaryDirectory() as tmpdirname:
-            filename = os.path.join(tmpdirname, u"modelcard.json")
+            filename = os.path.join(tmpdirname, "modelcard.json")
             model_card_first.to_json_file(filename)
             model_card_second = ModelCard.from_json_file(filename)
 
@@ -85,5 +80,6 @@ class ModelCardTester(unittest.TestCase):
 
         self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/modeling_albert_test.py b/transformers/tests/modeling_albert_test.py
index b726fd9278..f798af95bc 100644
--- a/transformers/tests/modeling_albert_test.py
+++ b/transformers/tests/modeling_albert_test.py
@@ -20,14 +20,18 @@ import unittest
 
 from transformers import is_torch_available
 
-from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .modeling_common_test import CommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 if is_torch_available():
-    from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM,
-                              AlbertForSequenceClassification, AlbertForQuestionAnswering,
-                              )
+    from transformers import (
+        AlbertConfig,
+        AlbertModel,
+        AlbertForMaskedLM,
+        AlbertForSequenceClassification,
+        AlbertForQuestionAnswering,
+    )
     from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
@@ -37,33 +41,33 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
     all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()
 
     class AlbertModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_input_mask=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     embedding_size=16,
-                     hidden_size=36,
-                     num_hidden_layers=6,
-                     num_hidden_groups=6,
-                     num_attention_heads=6,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                    ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_input_mask=True,
+            use_token_type_ids=True,
+            use_labels=True,
+            vocab_size=99,
+            embedding_size=16,
+            hidden_size=36,
+            num_hidden_layers=6,
+            num_hidden_groups=6,
+            num_attention_heads=6,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -120,16 +124,17 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
                 max_position_embeddings=self.max_position_embeddings,
                 type_vocab_size=self.type_vocab_size,
                 initializer_range=self.initializer_range,
-                num_hidden_groups=self.num_hidden_groups)
+                num_hidden_groups=self.num_hidden_groups,
+            )
 
             return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 
         def check_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
+            self.parent.assertListEqual(list(result["loss"].size()), [])
 
-        def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_albert_model(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = AlbertModel(config=config)
             model.to(torch_device)
             model.eval()
@@ -142,66 +147,79 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
                 "pooled_output": pooled_output,
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
 
-
-        def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_albert_for_masked_lm(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = AlbertForMaskedLM(config=config)
             model.to(torch_device)
             model.eval()
-            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
+            loss, prediction_scores = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
+            )
             result = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
             self.check_loss_output(result)
 
-        def create_and_check_albert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_albert_for_question_answering(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = AlbertForQuestionAnswering(config=config)
             model.to(torch_device)
             model.eval()
-            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
-                                                   start_positions=sequence_labels, end_positions=sequence_labels)
+            loss, start_logits, end_logits = model(
+                input_ids,
+                attention_mask=input_mask,
+                token_type_ids=token_type_ids,
+                start_positions=sequence_labels,
+                end_positions=sequence_labels,
+            )
             result = {
                 "loss": loss,
                 "start_logits": start_logits,
                 "end_logits": end_logits,
             }
-            self.parent.assertListEqual(
-                list(result["start_logits"].size()),
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["end_logits"].size()),
-                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
             self.check_loss_output(result)
 
-
-        def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_albert_for_sequence_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = AlbertForSequenceClassification(config)
             model.to(torch_device)
             model.eval()
-            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
+            loss, logits = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
+            )
             result = {
                 "loss": loss,
                 "logits": logits,
             }
-            self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.num_labels])
+            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
             self.check_loss_output(result)
 
-
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, input_mask,
-             sequence_labels, token_labels, choice_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            (
+                config,
+                input_ids,
+                token_type_ids,
+                input_mask,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
             return config, inputs_dict
 
     def setUp(self):
@@ -233,5 +251,6 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
             model = AlbertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/modeling_auto_test.py b/transformers/tests/modeling_auto_test.py
index 871a262fe8..3bdaa8a378 100644
--- a/transformers/tests/modeling_auto_test.py
+++ b/transformers/tests/modeling_auto_test.py
@@ -25,14 +25,21 @@ from transformers import is_torch_available
 from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER
 
 if is_torch_available():
-    from transformers import (AutoConfig, BertConfig,
-                                    AutoModel, BertModel,
-                                    AutoModelWithLMHead, BertForMaskedLM,
-                                    AutoModelForSequenceClassification, BertForSequenceClassification,
-                                    AutoModelForQuestionAnswering, BertForQuestionAnswering)
+    from transformers import (
+        AutoConfig,
+        BertConfig,
+        AutoModel,
+        BertModel,
+        AutoModelWithLMHead,
+        BertForMaskedLM,
+        AutoModelForSequenceClassification,
+        BertForSequenceClassification,
+        AutoModelForQuestionAnswering,
+        BertForQuestionAnswering,
+    )
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
-    from .modeling_common_test import (CommonTestCases, ids_tensor)
+    from .modeling_common_test import CommonTestCases, ids_tensor
     from .configuration_common_test import ConfigTester
 
 
@@ -75,7 +82,9 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsInstance(config, BertConfig)
 
             model = AutoModelForSequenceClassification.from_pretrained(model_name)
-            model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True)
+            model, loading_info = AutoModelForSequenceClassification.from_pretrained(
+                model_name, output_loading_info=True
+            )
             self.assertIsNotNone(model)
             self.assertIsInstance(model, BertForSequenceClassification)
 
diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py
index a5adff8f68..6711aded69 100644
--- a/transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -20,51 +20,68 @@ import unittest
 
 from transformers import is_torch_available
 
-from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
+from .modeling_common_test import CommonTestCases, ids_tensor, floats_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 if is_torch_available():
-    from transformers import (BertConfig, BertModel, BertForMaskedLM,
-                              BertForNextSentencePrediction, BertForPreTraining,
-                              BertForQuestionAnswering, BertForSequenceClassification,
-                              BertForTokenClassification, BertForMultipleChoice)
+    from transformers import (
+        BertConfig,
+        BertModel,
+        BertForMaskedLM,
+        BertForNextSentencePrediction,
+        BertForPreTraining,
+        BertForQuestionAnswering,
+        BertForSequenceClassification,
+        BertForTokenClassification,
+        BertForMultipleChoice,
+    )
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
 @require_torch
 class BertModelTest(CommonTestCases.CommonModelTester):
 
-    all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
-                         BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
-                         BertForTokenClassification) if is_torch_available() else ()
+    all_model_classes = (
+        (
+            BertModel,
+            BertForMaskedLM,
+            BertForNextSentencePrediction,
+            BertForPreTraining,
+            BertForQuestionAnswering,
+            BertForSequenceClassification,
+            BertForTokenClassification,
+        )
+        if is_torch_available()
+        else ()
+    )
 
     class BertModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_input_mask=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                     ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_input_mask=True,
+            use_token_type_ids=True,
+            use_labels=True,
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -119,25 +136,44 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 max_position_embeddings=self.max_position_embeddings,
                 type_vocab_size=self.type_vocab_size,
                 is_decoder=False,
-                initializer_range=self.initializer_range)
+                initializer_range=self.initializer_range,
+            )
 
             return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 
         def prepare_config_and_inputs_for_decoder(self):
-            config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels = self.prepare_config_and_inputs()
+            (
+                config,
+                input_ids,
+                token_type_ids,
+                input_mask,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = self.prepare_config_and_inputs()
 
             config.is_decoder = True
             encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
             encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
 
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask
+            return (
+                config,
+                input_ids,
+                token_type_ids,
+                input_mask,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+                encoder_hidden_states,
+                encoder_attention_mask,
+            )
 
         def check_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
+            self.parent.assertListEqual(list(result["loss"].size()), [])
 
-        def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_model(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = BertModel(config=config)
             model.to(torch_device)
             model.eval()
@@ -150,16 +186,38 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 "pooled_output": pooled_output,
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
 
-        def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
+        def create_and_check_bert_model_as_decoder(
+            self,
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+            encoder_hidden_states,
+            encoder_attention_mask,
+        ):
             model = BertModel(config)
             model.to(torch_device)
             model.eval()
-            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
-            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
+            sequence_output, pooled_output = model(
+                input_ids,
+                attention_mask=input_mask,
+                token_type_ids=token_type_ids,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+            )
+            sequence_output, pooled_output = model(
+                input_ids,
+                attention_mask=input_mask,
+                token_type_ids=token_type_ids,
+                encoder_hidden_states=encoder_hidden_states,
+            )
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
 
             result = {
@@ -167,122 +225,171 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 "pooled_output": pooled_output,
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
 
-        def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_for_masked_lm(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = BertForMaskedLM(config=config)
             model.to(torch_device)
             model.eval()
-            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
+            loss, prediction_scores = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
+            )
             result = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
             self.check_loss_output(result)
 
-        def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
+        def create_and_check_bert_model_for_masked_lm_as_decoder(
+            self,
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+            encoder_hidden_states,
+            encoder_attention_mask,
+        ):
             model = BertForMaskedLM(config=config)
             model.to(torch_device)
             model.eval()
-            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
-            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states)
+            loss, prediction_scores = model(
+                input_ids,
+                attention_mask=input_mask,
+                token_type_ids=token_type_ids,
+                masked_lm_labels=token_labels,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+            )
+            loss, prediction_scores = model(
+                input_ids,
+                attention_mask=input_mask,
+                token_type_ids=token_type_ids,
+                masked_lm_labels=token_labels,
+                encoder_hidden_states=encoder_hidden_states,
+            )
             result = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
             self.check_loss_output(result)
 
-        def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_for_next_sequence_prediction(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = BertForNextSentencePrediction(config=config)
             model.to(torch_device)
             model.eval()
-            loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
+            loss, seq_relationship_score = model(
+                input_ids,
+                attention_mask=input_mask,
+                token_type_ids=token_type_ids,
+                next_sentence_label=sequence_labels,
+            )
             result = {
                 "loss": loss,
                 "seq_relationship_score": seq_relationship_score,
             }
-            self.parent.assertListEqual(
-                list(result["seq_relationship_score"].size()),
-                [self.batch_size, 2])
+            self.parent.assertListEqual(list(result["seq_relationship_score"].size()), [self.batch_size, 2])
             self.check_loss_output(result)
 
-        def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_for_pretraining(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = BertForPreTraining(config=config)
             model.to(torch_device)
             model.eval()
-            loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
-                                                                    masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
+            loss, prediction_scores, seq_relationship_score = model(
+                input_ids,
+                attention_mask=input_mask,
+                token_type_ids=token_type_ids,
+                masked_lm_labels=token_labels,
+                next_sentence_label=sequence_labels,
+            )
             result = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
                 "seq_relationship_score": seq_relationship_score,
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
-            self.parent.assertListEqual(
-                list(result["seq_relationship_score"].size()),
-                [self.batch_size, 2])
+                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
+            self.parent.assertListEqual(list(result["seq_relationship_score"].size()), [self.batch_size, 2])
             self.check_loss_output(result)
 
-        def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_for_question_answering(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = BertForQuestionAnswering(config=config)
             model.to(torch_device)
             model.eval()
-            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
-                                                   start_positions=sequence_labels, end_positions=sequence_labels)
+            loss, start_logits, end_logits = model(
+                input_ids,
+                attention_mask=input_mask,
+                token_type_ids=token_type_ids,
+                start_positions=sequence_labels,
+                end_positions=sequence_labels,
+            )
             result = {
                 "loss": loss,
                 "start_logits": start_logits,
                 "end_logits": end_logits,
             }
-            self.parent.assertListEqual(
-                list(result["start_logits"].size()),
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["end_logits"].size()),
-                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
             self.check_loss_output(result)
 
-        def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_for_sequence_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = BertForSequenceClassification(config)
             model.to(torch_device)
             model.eval()
-            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
+            loss, logits = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
+            )
             result = {
                 "loss": loss,
                 "logits": logits,
             }
-            self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.num_labels])
+            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
             self.check_loss_output(result)
 
-        def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_for_token_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = BertForTokenClassification(config=config)
             model.to(torch_device)
             model.eval()
-            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+            loss, logits = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
+            )
             result = {
                 "loss": loss,
                 "logits": logits,
             }
             self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.seq_length, self.num_labels])
+                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
+            )
             self.check_loss_output(result)
 
-        def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_for_multiple_choice(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_choices = self.num_choices
             model = BertForMultipleChoice(config=config)
             model.to(torch_device)
@@ -290,24 +397,31 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            loss, logits = model(multiple_choice_inputs_ids,
-                                 attention_mask=multiple_choice_input_mask,
-                                 token_type_ids=multiple_choice_token_type_ids,
-                                 labels=choice_labels)
+            loss, logits = model(
+                multiple_choice_inputs_ids,
+                attention_mask=multiple_choice_input_mask,
+                token_type_ids=multiple_choice_token_type_ids,
+                labels=choice_labels,
+            )
             result = {
                 "loss": loss,
                 "logits": logits,
             }
-            self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.num_choices])
+            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
             self.check_loss_output(result)
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, input_mask,
-             sequence_labels, token_labels, choice_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            (
+                config,
+                input_ids,
+                token_type_ids,
+                input_mask,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
             return config, inputs_dict
 
     def setUp(self):
diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index 2116651f4a..6834c78d15 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -36,34 +36,48 @@ if is_torch_available():
     import torch
     import numpy as np
 
-    from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel,
-                                    BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                    GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from transformers import (
+        AdaptiveEmbedding,
+        PretrainedConfig,
+        PreTrainedModel,
+        BertModel,
+        BertConfig,
+        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        GPT2LMHeadModel,
+        GPT2Config,
+        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
 
     class TemporaryDirectory(object):
         """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""
+
         def __enter__(self):
             self.name = tempfile.mkdtemp()
             return self.name
+
         def __exit__(self, exc_type, exc_value, traceback):
             shutil.rmtree(self.name)
+
+
 else:
     import pickle
+
     TemporaryDirectory = tempfile.TemporaryDirectory
     unicode = str
 
+
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
     for key in configs_no_init.__dict__.keys():
-        if '_range' in key or '_std' in key or 'initializer_factor' in key:
+        if "_range" in key or "_std" in key or "initializer_factor" in key:
             setattr(configs_no_init, key, 0.0)
     return configs_no_init
 
-class CommonTestCases:
 
+class CommonTestCases:
     @require_torch
     class CommonModelTester(unittest.TestCase):
 
@@ -108,8 +122,11 @@ class CommonTestCases:
                 model = model_class(config=configs_no_init)
                 for name, param in model.named_parameters():
                     if param.requires_grad:
-                        self.assertIn(param.data.mean().item(), [0.0, 1.0],
-                        msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
+                        self.assertIn(
+                            param.data.mean().item(),
+                            [0.0, 1.0],
+                            msg="Parameter {} of model {} seems not properly initialized".format(name, model_class),
+                        )
 
         def test_determinism(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -131,10 +148,22 @@ class CommonTestCases:
         def test_attention_outputs(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-            decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
-            encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
-            decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
-            encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
+            decoder_seq_length = (
+                self.model_tester.decoder_seq_length
+                if hasattr(self.model_tester, "decoder_seq_length")
+                else self.model_tester.seq_length
+            )
+            encoder_seq_length = (
+                self.model_tester.encoder_seq_length
+                if hasattr(self.model_tester, "encoder_seq_length")
+                else self.model_tester.seq_length
+            )
+            decoder_key_length = (
+                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length
+            )
+            encoder_key_length = (
+                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
+            )
 
             for model_class in self.all_model_classes:
                 config.output_attentions = True
@@ -150,23 +179,20 @@ class CommonTestCases:
                 self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads,
-                    encoder_seq_length ,
-                    encoder_key_length])
+                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+                )
                 out_len = len(outputs)
 
                 if self.is_encoder_decoder:
                     self.assertEqual(out_len % 2, 0)
-                    decoder_attentions = outputs[(out_len // 2)-1]
+                    decoder_attentions = outputs[(out_len // 2) - 1]
                     self.assertEqual(model.config.output_attentions, True)
                     self.assertEqual(model.config.output_hidden_states, False)
                     self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
                     self.assertListEqual(
                         list(decoder_attentions[0].shape[-3:]),
-                        [self.model_tester.num_attention_heads,
-                         decoder_seq_length,
-                         decoder_key_length
-                         ])
+                        [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
+                    )
 
                 # Check attention is always last and order is fine
                 config.output_attentions = True
@@ -184,9 +210,8 @@ class CommonTestCases:
                 self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
                     list(self_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads,
-                    encoder_seq_length,
-                    encoder_key_length])
+                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+                )
 
         def test_torchscript(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -215,7 +240,7 @@ class CommonTestCases:
                 model = model_class(config=configs_no_init)
                 model.to(torch_device)
                 model.eval()
-                inputs = inputs_dict['input_ids']  # Let's keep only input_ids
+                inputs = inputs_dict["input_ids"]  # Let's keep only input_ids
 
                 try:
                     traced_gpt2 = torch.jit.trace(model, inputs)
@@ -269,12 +294,14 @@ class CommonTestCases:
 
                 # Prepare head_mask
                 # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
-                head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device)
+                head_mask = torch.ones(
+                    self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device
+                )
                 head_mask[0, 0] = 0
                 head_mask[-1, :-1] = 0
                 head_mask.requires_grad_(requires_grad=True)
                 inputs = inputs_dict.copy()
-                inputs['head_mask'] = head_mask
+                inputs["head_mask"] = head_mask
 
                 outputs = model(**inputs)
 
@@ -289,21 +316,20 @@ class CommonTestCases:
 
                 # Remove Nan
                 for t in attentions:
-                    self.assertLess(torch.sum(torch.isnan(t)), t.numel() / 4)  # Check we don't have more than 25% nans (arbitrary)
-                attentions = [t.masked_fill(torch.isnan(t), 0.0) for t in attentions]  # remove them (the test is less complete)
+                    self.assertLess(
+                        torch.sum(torch.isnan(t)), t.numel() / 4
+                    )  # Check we don't have more than 25% nans (arbitrary)
+                attentions = [
+                    t.masked_fill(torch.isnan(t), 0.0) for t in attentions
+                ]  # remove them (the test is less complete)
 
                 self.assertIsNotNone(multihead_outputs)
                 self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
-                self.assertAlmostEqual(
-                    attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
-                self.assertNotEqual(
-                    attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
-                self.assertNotEqual(
-                    attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
-                self.assertAlmostEqual(
-                    attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
-                self.assertNotEqual(
-                    attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
+                self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
+                self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
+                self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
+                self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
+                self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
 
         def test_head_pruning(self):
             if not self.test_pruning:
@@ -320,20 +346,16 @@ class CommonTestCases:
                 model = model_class(config=config)
                 model.to(torch_device)
                 model.eval()
-                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
-                                -1: [0]}
+                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
                 model.prune_heads(heads_to_prune)
                 with torch.no_grad():
                     outputs = model(**inputs_dict)
 
                 attentions = outputs[-1]
 
-                self.assertEqual(
-                    attentions[0].shape[-3], 1)
-                self.assertEqual(
-                    attentions[1].shape[-3], self.model_tester.num_attention_heads)
-                self.assertEqual(
-                    attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
+                self.assertEqual(attentions[0].shape[-3], 1)
+                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+                self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
 
         def test_head_pruning_save_load_from_pretrained(self):
             if not self.test_pruning:
@@ -350,8 +372,7 @@ class CommonTestCases:
                 model = model_class(config=config)
                 model.to(torch_device)
                 model.eval()
-                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
-                                -1: [0]}
+                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
                 model.prune_heads(heads_to_prune)
 
                 with TemporaryDirectory() as temp_dir_name:
@@ -366,7 +387,6 @@ class CommonTestCases:
                 self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
                 self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
 
-
         def test_head_pruning_save_load_from_config_init(self):
             if not self.test_pruning:
                 return
@@ -380,8 +400,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
 
-                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
-                                 -1: [0]}
+                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
                 config.pruned_heads = heads_to_prune
 
                 model = model_class(config=config)
@@ -446,7 +465,7 @@ class CommonTestCases:
                     outputs = model(**inputs_dict)
                 attentions = outputs[-1]
 
-                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1)
+                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
                 self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
                 self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
                 self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
@@ -470,8 +489,13 @@ class CommonTestCases:
                 self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
-                    [self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length,
-                     self.model_tester.hidden_size])
+                    [
+                        self.model_tester.encoder_seq_length
+                        if hasattr(self.model_tester, "encoder_seq_length")
+                        else self.model_tester.seq_length,
+                        self.model_tester.hidden_size,
+                    ],
+                )
 
         def test_resize_tokens_embeddings(self):
             original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -512,15 +536,10 @@ class CommonTestCases:
 
             for model_class in self.all_model_classes:
                 model = model_class(config)
-                self.assertIsInstance(
-                    model.get_input_embeddings(),
-                    (torch.nn.Embedding, AdaptiveEmbedding)
-                )
+                self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding))
                 model.set_input_embeddings(torch.nn.Embedding(10, 10))
                 x = model.get_output_embeddings()
-                self.assertTrue(
-                    x is None or isinstance(x, torch.nn.Linear)
-                )
+                self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
 
         def test_tie_model_weights(self):
             if not self.test_torchscript:
@@ -602,30 +621,30 @@ class CommonTestCases:
                     outputs = model(**inputs_dict)
 
     class GPTModelTester(CommonModelTester):
-
-        def __init__(self,
-                        parent,
-                        batch_size=13,
-                        seq_length=7,
-                        is_training=True,
-                        use_position_ids=True,
-                        use_token_type_ids=True,
-                        use_labels=True,
-                        vocab_size=99,
-                        n_positions=33,
-                        hidden_size=32,
-                        num_hidden_layers=5,
-                        num_attention_heads=4,
-                        n_choices=3,
-                        type_sequence_label_size=2,
-                        initializer_range=0.02,
-                        num_labels=3,
-                        scope=None,
-                        config_class=None,
-                        base_model_class=None,
-                        lm_head_model_class=None,
-                        double_head_model_class=None,
-                        ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_position_ids=True,
+            use_token_type_ids=True,
+            use_labels=True,
+            vocab_size=99,
+            n_positions=33,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            n_choices=3,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            scope=None,
+            config_class=None,
+            base_model_class=None,
+            lm_head_model_class=None,
+            double_head_model_class=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -676,13 +695,14 @@ class CommonTestCases:
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
-                initializer_range=self.initializer_range)
+                initializer_range=self.initializer_range,
+            )
 
-            return (config, input_ids, token_type_ids, position_ids,
-                    mc_labels, lm_labels, mc_token_ids)
+            return (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids)
 
-        def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
-                                mc_labels, lm_labels, mc_token_ids):
+        def create_and_check_base_model(
+            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
+        ):
             model = self.base_model_class(config)
             model.to(torch_device)
             model.eval()
@@ -694,12 +714,12 @@ class CommonTestCases:
 
             hidden_state = outputs[0]
             self.parent.assertListEqual(
-                list(hidden_state.size()),
-                [self.batch_size, self.n_choices, self.seq_length, self.hidden_size])
+                list(hidden_state.size()), [self.batch_size, self.n_choices, self.seq_length, self.hidden_size]
+            )
 
-
-        def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
-                                        mc_labels, lm_labels, mc_token_ids):
+        def create_and_check_lm_head(
+            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
+        ):
             model = self.lm_head_model_class(config)
             model.to(torch_device)
             model.eval()
@@ -709,14 +729,13 @@ class CommonTestCases:
 
             total_voc = self.vocab_size
             self.parent.assertListEqual(
-                list(lm_logits.size()),
-                [self.batch_size, self.n_choices, self.seq_length, total_voc])
-            self.parent.assertListEqual(
-                list(loss.size()),
-                [])
+                list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]
+            )
+            self.parent.assertListEqual(list(loss.size()), [])
 
-        def create_and_check_presents(self, config, input_ids, token_type_ids, position_ids,
-                                        mc_labels, lm_labels, mc_token_ids):
+        def create_and_check_presents(
+            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
+        ):
             for model_class in self.all_model_classes:
                 model = model_class(config)
                 model.to(torch_device)
@@ -727,30 +746,39 @@ class CommonTestCases:
                 self.parent.assertEqual(self.num_hidden_layers, len(presents))
                 self.parent.assertListEqual(
                     list(presents[0].size()),
-                    [2, self.batch_size * self.n_choices, self.num_attention_heads,
-                        self.seq_length, self.hidden_size // self.num_attention_heads])
+                    [
+                        2,
+                        self.batch_size * self.n_choices,
+                        self.num_attention_heads,
+                        self.seq_length,
+                        self.hidden_size // self.num_attention_heads,
+                    ],
+                )
 
-        def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
-                                        mc_labels, lm_labels, mc_token_ids):
+        def create_and_check_double_heads(
+            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
+        ):
             model = self.double_head_model_class(config)
             model.to(torch_device)
             model.eval()
             with torch.no_grad():
-                outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
-                            token_type_ids=token_type_ids, position_ids=position_ids)
+                outputs = model(
+                    input_ids,
+                    mc_token_ids,
+                    lm_labels=lm_labels,
+                    mc_labels=mc_labels,
+                    token_type_ids=token_type_ids,
+                    position_ids=position_ids,
+                )
             lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
             loss = [lm_loss, mc_loss]
 
             total_voc = self.vocab_size
             self.parent.assertListEqual(
-                list(lm_logits.size()),
-                [self.batch_size, self.n_choices, self.seq_length, total_voc])
-            self.parent.assertListEqual(
-                list(mc_logits.size()),
-                [self.batch_size, self.n_choices])
-            self.parent.assertListEqual(
-                [list(l.size()) for l in loss],
-                [[], []])
+                list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]
+            )
+            self.parent.assertListEqual(list(mc_logits.size()), [self.batch_size, self.n_choices])
+            self.parent.assertListEqual([list(l.size()) for l in loss], [[], []])
 
         def create_and_check_model_from_pretrained(self):
             for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
@@ -759,9 +787,8 @@ class CommonTestCases:
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, position_ids,
-                mc_labels, lm_labels, mc_token_ids) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids}
+            (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids}
             return config, inputs_dict
 
         def run_common_tests(self, test_presents=False):
@@ -791,10 +818,10 @@ class ConfigTester(object):
 
     def create_and_test_config_common_properties(self):
         config = self.config_class(**self.inputs_dict)
-        self.parent.assertTrue(hasattr(config, 'vocab_size'))
-        self.parent.assertTrue(hasattr(config, 'hidden_size'))
-        self.parent.assertTrue(hasattr(config, 'num_attention_heads'))
-        self.parent.assertTrue(hasattr(config, 'num_hidden_layers'))
+        self.parent.assertTrue(hasattr(config, "vocab_size"))
+        self.parent.assertTrue(hasattr(config, "hidden_size"))
+        self.parent.assertTrue(hasattr(config, "num_attention_heads"))
+        self.parent.assertTrue(hasattr(config, "num_hidden_layers"))
 
     def create_and_test_config_to_json_string(self):
         config = self.config_class(**self.inputs_dict)
diff --git a/transformers/tests/modeling_ctrl_test.py b/transformers/tests/modeling_ctrl_test.py
index ed0d62d1e6..9b71b1dd50 100644
--- a/transformers/tests/modeling_ctrl_test.py
+++ b/transformers/tests/modeling_ctrl_test.py
@@ -21,10 +21,9 @@ import pdb
 from transformers import is_torch_available
 
 if is_torch_available():
-    from transformers import (CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                    CTRLLMHeadModel)
+    from transformers import CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel
 
-from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .modeling_common_test import CommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
@@ -39,32 +38,32 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
     test_head_masking = False
 
     class CTRLModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_token_type_ids=True,
-                     use_input_mask=True,
-                     use_labels=True,
-                     use_mc_token_ids=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                     ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_token_type_ids=True,
+            use_input_mask=True,
+            use_labels=True,
+            use_mc_token_ids=True,
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -129,12 +128,20 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
 
             head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
 
-            return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels
+            return (
+                config,
+                input_ids,
+                input_mask,
+                head_mask,
+                token_type_ids,
+                mc_token_ids,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            )
 
         def check_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
+            self.parent.assertListEqual(list(result["loss"].size()), [])
 
         def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = CTRLModel(config=config)
@@ -150,8 +157,8 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
                 "presents": presents,
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertEqual(len(result["presents"]), config.n_layer)
 
         def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
@@ -161,29 +168,28 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
 
             loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
 
-            result = {
-                "loss": loss,
-                "lm_logits": lm_logits
-            }
+            result = {"loss": loss, "lm_logits": lm_logits}
+            self.parent.assertListEqual(list(result["loss"].size()), [])
             self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
-
+                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
 
-            (config, input_ids, input_mask, head_mask, token_type_ids,
-             mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
+            (
+                config,
+                input_ids,
+                input_mask,
+                head_mask,
+                token_type_ids,
+                mc_token_ids,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
 
-            inputs_dict = {
-                'input_ids': input_ids,
-                'token_type_ids': token_type_ids,
-                'head_mask': head_mask
-            }
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask}
 
             return config, inputs_dict
 
diff --git a/transformers/tests/modeling_distilbert_test.py b/transformers/tests/modeling_distilbert_test.py
index ac6f5d248e..5b4f4683de 100644
--- a/transformers/tests/modeling_distilbert_test.py
+++ b/transformers/tests/modeling_distilbert_test.py
@@ -21,11 +21,16 @@ import unittest
 from transformers import is_torch_available
 
 if is_torch_available():
-    from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
-                                    DistilBertForTokenClassification,
-                                    DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
+    from transformers import (
+        DistilBertConfig,
+        DistilBertModel,
+        DistilBertForMaskedLM,
+        DistilBertForTokenClassification,
+        DistilBertForQuestionAnswering,
+        DistilBertForSequenceClassification,
+    )
 
-from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .modeling_common_test import CommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
@@ -33,39 +38,42 @@ from .utils import CACHE_DIR, require_torch, slow, torch_device
 @require_torch
 class DistilBertModelTest(CommonTestCases.CommonModelTester):
 
-    all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
-                         DistilBertForSequenceClassification) if is_torch_available() else None
+    all_model_classes = (
+        (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
+        if is_torch_available()
+        else None
+    )
     test_pruning = True
     test_torchscript = True
     test_resize_embeddings = True
     test_head_masking = True
 
     class DistilBertModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_input_mask=True,
-                     use_token_type_ids=False,
-                     use_labels=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                    ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_input_mask=True,
+            use_token_type_ids=False,
+            use_labels=True,
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -114,16 +122,17 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                 dropout=self.hidden_dropout_prob,
                 attention_dropout=self.attention_probs_dropout_prob,
                 max_position_embeddings=self.max_position_embeddings,
-                initializer_range=self.initializer_range)
+                initializer_range=self.initializer_range,
+            )
 
             return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
 
         def check_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
+            self.parent.assertListEqual(list(result["loss"].size()), [])
 
-        def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_distilbert_model(
+            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = DistilBertModel(config=config)
             model.to(torch_device)
             model.eval()
@@ -134,10 +143,12 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                 "sequence_output": sequence_output,
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+            )
 
-        def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_distilbert_for_masked_lm(
+            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = DistilBertForMaskedLM(config=config)
             model.to(torch_device)
             model.eval()
@@ -147,29 +158,31 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                 "prediction_scores": prediction_scores,
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
             self.check_loss_output(result)
 
-        def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_distilbert_for_question_answering(
+            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = DistilBertForQuestionAnswering(config=config)
             model.to(torch_device)
             model.eval()
-            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
+            loss, start_logits, end_logits = model(
+                input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels
+            )
             result = {
                 "loss": loss,
                 "start_logits": start_logits,
                 "end_logits": end_logits,
             }
-            self.parent.assertListEqual(
-                list(result["start_logits"].size()),
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["end_logits"].size()),
-                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
             self.check_loss_output(result)
 
-        def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_distilbert_for_sequence_classification(
+            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = DistilBertForSequenceClassification(config)
             model.to(torch_device)
@@ -179,12 +192,12 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                 "loss": loss,
                 "logits": logits,
             }
-            self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.num_labels])
+            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
             self.check_loss_output(result)
 
-        def create_and_check_distilbert_for_token_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_distilbert_for_token_classification(
+            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = DistilBertForTokenClassification(config=config)
             model.to(torch_device)
@@ -196,14 +209,14 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                 "logits": logits,
             }
             self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.seq_length, self.num_labels])
+                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
+            )
             self.check_loss_output(result)
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
             (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids, 'attention_mask': input_mask}
+            inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
             return config, inputs_dict
 
     def setUp(self):
@@ -239,5 +252,6 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
     #         model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
     #         self.assertIsNotNone(model)
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/modeling_encoder_decoder_test.py b/transformers/tests/modeling_encoder_decoder_test.py
index 64e86df8f5..491c502bac 100644
--- a/transformers/tests/modeling_encoder_decoder_test.py
+++ b/transformers/tests/modeling_encoder_decoder_test.py
@@ -39,13 +39,13 @@ class EncoderDecoderModelTest(unittest.TestCase):
     def test_model2model_from_pretrained_not_bert(self):
         logging.basicConfig(level=logging.INFO)
         with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained('roberta')
+            _ = Model2Model.from_pretrained("roberta")
 
         with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained('distilbert')
+            _ = Model2Model.from_pretrained("distilbert")
 
         with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained('does-not-exist')
+            _ = Model2Model.from_pretrained("does-not-exist")
 
 
 if __name__ == "__main__":
diff --git a/transformers/tests/modeling_gpt2_test.py b/transformers/tests/modeling_gpt2_test.py
index ad2ec1fd91..2706166b33 100644
--- a/transformers/tests/modeling_gpt2_test.py
+++ b/transformers/tests/modeling_gpt2_test.py
@@ -21,10 +21,15 @@ import unittest
 from transformers import is_torch_available
 
 if is_torch_available():
-    from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                    GPT2LMHeadModel, GPT2DoubleHeadsModel)
+    from transformers import (
+        GPT2Config,
+        GPT2Model,
+        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+        GPT2LMHeadModel,
+        GPT2DoubleHeadsModel,
+    )
 
-from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .modeling_common_test import CommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
@@ -35,32 +40,32 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
     all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
 
     class GPT2ModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_token_type_ids=True,
-                     use_input_mask=True,
-                     use_labels=True,
-                     use_mc_token_ids=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                     ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_token_type_ids=True,
+            use_input_mask=True,
+            use_labels=True,
+            use_mc_token_ids=True,
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -125,12 +130,20 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
 
             head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
 
-            return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels
+            return (
+                config,
+                input_ids,
+                input_mask,
+                head_mask,
+                token_type_ids,
+                mc_token_ids,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            )
 
         def check_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
+            self.parent.assertListEqual(list(result["loss"].size()), [])
 
         def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = GPT2Model(config=config)
@@ -146,8 +159,8 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                 "presents": presents,
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertEqual(len(result["presents"]), config.n_layer)
 
         def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
@@ -157,63 +170,58 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
 
             loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
 
-            result = {
-                "loss": loss,
-                "lm_logits": lm_logits
-            }
+            result = {"loss": loss, "lm_logits": lm_logits}
 
+            self.parent.assertListEqual(list(result["loss"].size()), [])
             self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
-        def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
+        def create_and_check_double_lm_head_model(
+            self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
+        ):
             model = GPT2DoubleHeadsModel(config)
             model.to(torch_device)
             model.eval()
 
-
             multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
 
-            inputs = {'input_ids': multiple_choice_inputs_ids,
-                      'mc_token_ids': mc_token_ids,
-                      'attention_mask': multiple_choice_input_mask,
-                      'token_type_ids': multiple_choice_token_type_ids,
-                      'lm_labels': multiple_choice_inputs_ids}
+            inputs = {
+                "input_ids": multiple_choice_inputs_ids,
+                "mc_token_ids": mc_token_ids,
+                "attention_mask": multiple_choice_input_mask,
+                "token_type_ids": multiple_choice_token_type_ids,
+                "lm_labels": multiple_choice_inputs_ids,
+            }
 
             loss, lm_logits, mc_logits, _ = model(**inputs)
 
-            result = {
-                "loss": loss,
-                "lm_logits": lm_logits,
-                "mc_logits": mc_logits
-            }
+            result = {"loss": loss, "lm_logits": lm_logits, "mc_logits": mc_logits}
 
+            self.parent.assertListEqual(list(result["loss"].size()), [])
             self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()),
-                [self.batch_size, self.num_choices, self.seq_length, self.vocab_size])
-            self.parent.assertListEqual(
-                list(result["mc_logits"].size()),
-                [self.batch_size, self.num_choices])
+                list(result["lm_logits"].size()), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
+            )
+            self.parent.assertListEqual(list(result["mc_logits"].size()), [self.batch_size, self.num_choices])
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
 
-            (config, input_ids, input_mask, head_mask, token_type_ids,
-             mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
+            (
+                config,
+                input_ids,
+                input_mask,
+                head_mask,
+                token_type_ids,
+                mc_token_ids,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
 
-            inputs_dict = {
-                'input_ids': input_ids,
-                'token_type_ids': token_type_ids,
-                'head_mask': head_mask
-            }
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask}
 
             return config, inputs_dict
 
diff --git a/transformers/tests/modeling_openai_test.py b/transformers/tests/modeling_openai_test.py
index 1880febcae..f22a0b760c 100644
--- a/transformers/tests/modeling_openai_test.py
+++ b/transformers/tests/modeling_openai_test.py
@@ -21,10 +21,15 @@ import unittest
 from transformers import is_torch_available
 
 if is_torch_available():
-    from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                    OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
+    from transformers import (
+        OpenAIGPTConfig,
+        OpenAIGPTModel,
+        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        OpenAIGPTLMHeadModel,
+        OpenAIGPTDoubleHeadsModel,
+    )
 
-from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .modeling_common_test import CommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
@@ -32,33 +37,35 @@ from .utils import CACHE_DIR, require_torch, slow, torch_device
 @require_torch
 class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
 
-    all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
+    all_model_classes = (
+        (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
+    )
 
     class OpenAIGPTModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                     ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_token_type_ids=True,
+            use_labels=True,
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -116,9 +123,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
             return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels
 
         def check_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
+            self.parent.assertListEqual(list(result["loss"].size()), [])
 
         def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
             model = OpenAIGPTModel(config=config)
@@ -129,12 +134,10 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
             model(input_ids, token_type_ids=token_type_ids)
             (sequence_output,) = model(input_ids)
 
-            result = {
-                "sequence_output": sequence_output
-            }
+            result = {"sequence_output": sequence_output}
             self.parent.assertListEqual(
-                list(result["sequence_output"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+            )
 
         def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
             model = OpenAIGPTLMHeadModel(config)
@@ -143,17 +146,12 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
 
             loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
 
-            result = {
-                "loss": loss,
-                "lm_logits": lm_logits
-            }
+            result = {"loss": loss, "lm_logits": lm_logits}
 
+            self.parent.assertListEqual(list(result["loss"].size()), [])
             self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
         def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
             model = OpenAIGPTDoubleHeadsModel(config)
@@ -162,26 +160,25 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
 
             loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
 
-            result = {
-                "loss": loss,
-                "lm_logits": lm_logits
-            }
+            result = {"loss": loss, "lm_logits": lm_logits}
 
+            self.parent.assertListEqual(list(result["loss"].size()), [])
             self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
-            inputs_dict = {
-                'input_ids': input_ids,
-                'token_type_ids': token_type_ids,
-                'head_mask': head_mask
-            }
+            (
+                config,
+                input_ids,
+                head_mask,
+                token_type_ids,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask}
 
             return config, inputs_dict
 
diff --git a/transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py
index 732e589cdf..451dafe08e 100644
--- a/transformers/tests/modeling_roberta_test.py
+++ b/transformers/tests/modeling_roberta_test.py
@@ -22,12 +22,17 @@ from transformers import is_torch_available
 
 if is_torch_available():
     import torch
-    from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM,
-                              RobertaForSequenceClassification, RobertaForTokenClassification)
+    from transformers import (
+        RobertaConfig,
+        RobertaModel,
+        RobertaForMaskedLM,
+        RobertaForSequenceClassification,
+        RobertaForTokenClassification,
+    )
     from transformers.modeling_roberta import RobertaEmbeddings
     from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .modeling_common_test import CommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
@@ -38,31 +43,31 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
     all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()
 
     class RobertaModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_input_mask=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                    ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_input_mask=True,
+            use_token_type_ids=True,
+            use_labels=True,
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -116,17 +121,17 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                 attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                 max_position_embeddings=self.max_position_embeddings,
                 type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range)
+                initializer_range=self.initializer_range,
+            )
 
             return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 
         def check_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
+            self.parent.assertListEqual(list(result["loss"].size()), [])
 
-        def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
-                                           token_labels, choice_labels):
+        def create_and_check_roberta_model(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = RobertaModel(config=config)
             model.to(torch_device)
             model.eval()
@@ -139,47 +144,59 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                 "pooled_output": pooled_output,
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
 
-        def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
-                                                   token_labels, choice_labels):
+        def create_and_check_roberta_for_masked_lm(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = RobertaForMaskedLM(config=config)
             model.to(torch_device)
             model.eval()
-            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
+            loss, prediction_scores = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
+            )
             result = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
             self.check_loss_output(result)
 
-        def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask,
-                                                              sequence_labels, token_labels, choice_labels):
+        def create_and_check_roberta_for_token_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = RobertaForTokenClassification(config=config)
             model.to(torch_device)
             model.eval()
-            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
-                                 labels=token_labels)
+            loss, logits = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
+            )
             result = {
                 "loss": loss,
                 "logits": logits,
             }
             self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.seq_length, self.num_labels])
+                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
+            )
             self.check_loss_output(result)
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, input_mask,
-             sequence_labels, token_labels, choice_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            (
+                config,
+                input_ids,
+                token_type_ids,
+                input_mask,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
             return config, inputs_dict
 
     def setUp(self):
@@ -214,18 +231,12 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
         model = RobertaEmbeddings(config=config)
 
         input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]])
-        expected_positions = torch.as_tensor([[
-            0 + model.padding_idx + 1,
-            1 + model.padding_idx + 1,
-            2 + model.padding_idx + 1,
-            model.padding_idx
-        ]])
+        expected_positions = torch.as_tensor(
+            [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]]
+        )
 
         position_ids = model.create_position_ids_from_input_ids(input_ids)
-        self.assertEqual(
-            position_ids.shape,
-            expected_positions.shape
-        )
+        self.assertEqual(position_ids.shape, expected_positions.shape)
         self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
 
     def test_create_position_ids_from_inputs_embeds(self):
@@ -247,69 +258,47 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
         ]
         expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions])
         position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds)
-        self.assertEqual(
-            position_ids.shape,
-            expected_positions.shape
-        )
-        self.assertTrue(
-            torch.all(torch.eq(position_ids, expected_positions))
-        )
+        self.assertEqual(position_ids.shape, expected_positions.shape)
+        self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
 
 
 class RobertaModelIntegrationTest(unittest.TestCase):
-
     @slow
     def test_inference_masked_lm(self):
-        model = RobertaForMaskedLM.from_pretrained('roberta-base')
+        model = RobertaForMaskedLM.from_pretrained("roberta-base")
 
-        input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
+        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
         expected_shape = torch.Size((1, 11, 50265))
-        self.assertEqual(
-            output.shape,
-            expected_shape
-        )
+        self.assertEqual(output.shape, expected_shape)
         # compare the actual values for a slice.
         expected_slice = torch.Tensor(
-            [[[33.8843, -4.3107, 22.7779],
-              [ 4.6533, -2.8099, 13.6252],
-              [ 1.8222, -3.6898,  8.8600]]]
-        )
-        self.assertTrue(
-            torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
+            [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]]
         )
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
 
     @slow
     def test_inference_no_head(self):
-        model = RobertaModel.from_pretrained('roberta-base')
+        model = RobertaModel.from_pretrained("roberta-base")
 
-        input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
+        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
         # compare the actual values for a slice.
         expected_slice = torch.Tensor(
-            [[[-0.0231,  0.0782,  0.0074],
-              [-0.1854,  0.0539, -0.0174],
-              [ 0.0548,  0.0799,  0.1687]]]
-        )
-        self.assertTrue(
-            torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
+            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0539, -0.0174], [0.0548, 0.0799, 0.1687]]]
         )
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
 
     @slow
     def test_inference_classification_head(self):
-        model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
+        model = RobertaForSequenceClassification.from_pretrained("roberta-large-mnli")
 
-        input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
+        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
         expected_shape = torch.Size((1, 3))
-        self.assertEqual(
-            output.shape,
-            expected_shape
-        )
-        expected_tensor = torch.Tensor([[-0.9469,  0.3913,  0.5118]])
-        self.assertTrue(
-            torch.allclose(output, expected_tensor, atol=1e-3)
-        )
+        self.assertEqual(output.shape, expected_shape)
+        expected_tensor = torch.Tensor([[-0.9469, 0.3913, 0.5118]])
+        self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-3))
 
 
 if __name__ == "__main__":
diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py
index 9fd9a4b304..3feb61a622 100644
--- a/transformers/tests/modeling_t5_test.py
+++ b/transformers/tests/modeling_t5_test.py
@@ -20,12 +20,12 @@ import unittest
 
 from transformers import is_torch_available
 
-from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
+from .modeling_common_test import CommonTestCases, ids_tensor, floats_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 if is_torch_available():
-    from transformers import (T5Config, T5Model, T5WithLMHeadModel)
+    from transformers import T5Config, T5Model, T5WithLMHeadModel
     from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
@@ -39,26 +39,26 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
     is_encoder_decoder = True
 
     class T5ModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     encoder_seq_length=7,
-                     decoder_seq_length=9,
-                     is_training=True,
-                     use_attention_mask=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     n_positions=14,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     d_ff=37,
-                     relative_attention_num_buckets=8,
-                     dropout_rate=0.1,
-                     initializer_factor=0.002,
-                     scope=None,
-                    ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            encoder_seq_length=7,
+            decoder_seq_length=9,
+            is_training=True,
+            use_attention_mask=True,
+            use_labels=True,
+            vocab_size=99,
+            n_positions=14,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            d_ff=37,
+            relative_attention_num_buckets=8,
+            dropout_rate=0.1,
+            initializer_factor=0.002,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.encoder_seq_length = encoder_seq_length
@@ -101,60 +101,96 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
                 num_heads=self.num_attention_heads,
                 relative_attention_num_buckets=self.relative_attention_num_buckets,
                 dropout_rate=self.dropout_rate,
-                initializer_factor=self.initializer_factor)
+                initializer_factor=self.initializer_factor,
+            )
 
-            return (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels)
+            return (
+                config,
+                encoder_input_ids,
+                decoder_input_ids,
+                encoder_attention_mask,
+                decoder_attention_mask,
+                decoder_lm_labels,
+            )
 
         def check_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
+            self.parent.assertListEqual(list(result["loss"].size()), [])
 
-        def create_and_check_t5_model(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
+        def create_and_check_t5_model(
+            self,
+            config,
+            encoder_input_ids,
+            decoder_input_ids,
+            encoder_attention_mask,
+            decoder_attention_mask,
+            decoder_lm_labels,
+        ):
             model = T5Model(config=config)
             model.eval()
-            decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
-                                                   decoder_input_ids=decoder_input_ids,
-                                                   encoder_attention_mask=encoder_attention_mask,
-                                                   decoder_attention_mask=decoder_attention_mask)
-            decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
-                                                   decoder_input_ids=decoder_input_ids)
+            decoder_output, encoder_output = model(
+                encoder_input_ids=encoder_input_ids,
+                decoder_input_ids=decoder_input_ids,
+                encoder_attention_mask=encoder_attention_mask,
+                decoder_attention_mask=decoder_attention_mask,
+            )
+            decoder_output, encoder_output = model(
+                encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids
+            )
 
             result = {
                 "encoder_output": encoder_output,
                 "decoder_output": decoder_output,
             }
             self.parent.assertListEqual(
-                list(result["encoder_output"].size()),
-                [self.batch_size, self.encoder_seq_length, self.hidden_size])
+                list(result["encoder_output"].size()), [self.batch_size, self.encoder_seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(
-                list(result["decoder_output"].size()),
-                [self.batch_size, self.decoder_seq_length, self.hidden_size])
+                list(result["decoder_output"].size()), [self.batch_size, self.decoder_seq_length, self.hidden_size]
+            )
 
-
-        def create_and_check_t5_with_lm_head(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
+        def create_and_check_t5_with_lm_head(
+            self,
+            config,
+            encoder_input_ids,
+            decoder_input_ids,
+            encoder_attention_mask,
+            decoder_attention_mask,
+            decoder_lm_labels,
+        ):
             model = T5WithLMHeadModel(config=config)
             model.eval()
-            outputs = model(encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids,
-                            decoder_attention_mask=decoder_attention_mask, decoder_lm_labels=decoder_lm_labels)
+            outputs = model(
+                encoder_input_ids=encoder_input_ids,
+                decoder_input_ids=decoder_input_ids,
+                decoder_attention_mask=decoder_attention_mask,
+                decoder_lm_labels=decoder_lm_labels,
+            )
             loss, prediction_scores = outputs[0], outputs[1]
             result = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].size()),
-                [self.batch_size, self.decoder_seq_length, self.vocab_size])
+                list(result["prediction_scores"].size()), [self.batch_size, self.decoder_seq_length, self.vocab_size]
+            )
             self.check_loss_output(result)
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask,
-             decoder_attention_mask, decoder_lm_labels) = config_and_inputs
-            inputs_dict = {'encoder_input_ids': encoder_input_ids,
-                           'decoder_input_ids': decoder_input_ids,
-                           'decoder_attention_mask': decoder_attention_mask,
-                           'encoder_attention_mask': encoder_attention_mask}
+            (
+                config,
+                encoder_input_ids,
+                decoder_input_ids,
+                encoder_attention_mask,
+                decoder_attention_mask,
+                decoder_lm_labels,
+            ) = config_and_inputs
+            inputs_dict = {
+                "encoder_input_ids": encoder_input_ids,
+                "decoder_input_ids": decoder_input_ids,
+                "decoder_attention_mask": decoder_attention_mask,
+                "encoder_attention_mask": encoder_attention_mask,
+            }
             return config, inputs_dict
 
     def setUp(self):
@@ -178,5 +214,6 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
             model = T5Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/modeling_tf_albert_test.py b/transformers/tests/modeling_tf_albert_test.py
index 374417cfe2..0406592d54 100644
--- a/transformers/tests/modeling_tf_albert_test.py
+++ b/transformers/tests/modeling_tf_albert_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 import unittest
 import sys
 
-from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_tf, slow
 
@@ -27,47 +27,48 @@ from transformers import AlbertConfig, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from transformers.modeling_tf_albert import (TFAlbertModel, TFAlbertForMaskedLM,
-                                                 TFAlbertForSequenceClassification,
-                                                 TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from transformers.modeling_tf_albert import (
+        TFAlbertModel,
+        TFAlbertForMaskedLM,
+        TFAlbertForSequenceClassification,
+        TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
 
 @require_tf
 class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (
-        TFAlbertModel,
-        TFAlbertForMaskedLM,
-        TFAlbertForSequenceClassification
-    ) if is_tf_available() else ()
+        (TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification) if is_tf_available() else ()
+    )
 
     class TFAlbertModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_input_mask=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     embedding_size=16,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                     ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_input_mask=True,
+            use_token_type_ids=True,
+            use_labels=True,
+            vocab_size=99,
+            embedding_size=16,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -93,27 +94,22 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
             self.scope = scope
 
         def prepare_config_and_inputs(self):
-            input_ids = ids_tensor(
-                [self.batch_size, self.seq_length], self.vocab_size)
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             input_mask = None
             if self.use_input_mask:
-                input_mask = ids_tensor(
-                    [self.batch_size, self.seq_length], vocab_size=2)
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
 
             token_type_ids = None
             if self.use_token_type_ids:
-                token_type_ids = ids_tensor(
-                    [self.batch_size, self.seq_length], self.type_vocab_size)
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
 
             sequence_labels = None
             token_labels = None
             choice_labels = None
             if self.use_labels:
-                sequence_labels = ids_tensor(
-                    [self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor(
-                    [self.batch_size, self.seq_length], self.num_labels)
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = AlbertConfig(
@@ -127,19 +123,20 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
                 attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                 max_position_embeddings=self.max_position_embeddings,
                 type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range)
+                initializer_range=self.initializer_range,
+            )
 
             return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 
-        def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_albert_model(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFAlbertModel(config=config)
             # inputs = {'input_ids': input_ids,
             #           'attention_mask': input_mask,
             #           'token_type_ids': token_type_ids}
             # sequence_output, pooled_output = model(**inputs)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
             sequence_output, pooled_output = model(inputs)
 
             inputs = [input_ids, input_mask]
@@ -152,50 +149,52 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
                 "pooled_output": pooled_output.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].shape),
-                [self.batch_size, self.seq_length, self.hidden_size])
-            self.parent.assertListEqual(list(result["pooled_output"].shape), [
-                                        self.batch_size, self.hidden_size])
+                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            )
+            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
 
-        def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_albert_for_masked_lm(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFAlbertForMaskedLM(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
-            prediction_scores, = model(inputs)
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            (prediction_scores,) = model(inputs)
             result = {
                 "prediction_scores": prediction_scores.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
-        def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_albert_for_sequence_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = TFAlbertForSequenceClassification(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
-            logits, = model(inputs)
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            (logits,) = model(inputs)
             result = {
                 "logits": logits.numpy(),
             }
-            self.parent.assertListEqual(
-                list(result["logits"].shape),
-                [self.batch_size, self.num_labels])
+            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, input_mask,
-             sequence_labels, token_labels, choice_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids,
-                           'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            (
+                config,
+                input_ids,
+                token_type_ids,
+                input_mask,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
             return config, inputs_dict
 
     def setUp(self):
         self.model_tester = TFAlbertModelTest.TFAlbertModelTester(self)
-        self.config_tester = ConfigTester(
-            self, config_class=AlbertConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)
 
     def test_config(self):
         self.config_tester.run_common_tests()
@@ -206,13 +205,11 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
 
     def test_for_masked_lm(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_for_masked_lm(
-            *config_and_inputs)
+        self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)
 
     def test_for_sequence_classification(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_for_sequence_classification(
-            *config_and_inputs)
+        self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)
 
     @slow
     def test_model_from_pretrained(self):
diff --git a/transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py
index 2ad39ddccf..d695474ecf 100644
--- a/transformers/tests/modeling_tf_auto_test.py
+++ b/transformers/tests/modeling_tf_auto_test.py
@@ -25,14 +25,21 @@ from transformers import is_tf_available
 from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER
 
 if is_tf_available():
-    from transformers import (AutoConfig, BertConfig,
-                                      TFAutoModel, TFBertModel,
-                                      TFAutoModelWithLMHead, TFBertForMaskedLM,
-                                      TFAutoModelForSequenceClassification, TFBertForSequenceClassification,
-                                      TFAutoModelForQuestionAnswering, TFBertForQuestionAnswering)
+    from transformers import (
+        AutoConfig,
+        BertConfig,
+        TFAutoModel,
+        TFBertModel,
+        TFAutoModelWithLMHead,
+        TFBertForMaskedLM,
+        TFAutoModelForSequenceClassification,
+        TFBertForSequenceClassification,
+        TFAutoModelForQuestionAnswering,
+        TFBertForQuestionAnswering,
+    )
     from transformers.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
-    from .modeling_common_test import (CommonTestCases, ids_tensor)
+    from .modeling_common_test import CommonTestCases, ids_tensor
     from .configuration_common_test import ConfigTester
 
 
@@ -41,11 +48,12 @@ class TFAutoModelTest(unittest.TestCase):
     @slow
     def test_model_from_pretrained(self):
         import h5py
+
         self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
 
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-        for model_name in ['bert-base-uncased']:
+        for model_name in ["bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)
@@ -58,7 +66,7 @@ class TFAutoModelTest(unittest.TestCase):
     def test_lmhead_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-        for model_name in ['bert-base-uncased']:
+        for model_name in ["bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)
@@ -71,7 +79,7 @@ class TFAutoModelTest(unittest.TestCase):
     def test_sequence_classification_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-        for model_name in ['bert-base-uncased']:
+        for model_name in ["bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)
@@ -84,7 +92,7 @@ class TFAutoModelTest(unittest.TestCase):
     def test_question_answering_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-        for model_name in ['bert-base-uncased']:
+        for model_name in ["bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)
diff --git a/transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py
index abf20b1514..e36e3a2c3f 100644
--- a/transformers/tests/modeling_tf_bert_test.py
+++ b/transformers/tests/modeling_tf_bert_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 import unittest
 import sys
 
-from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_tf, slow
 
@@ -27,49 +27,62 @@ from transformers import BertConfig, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from transformers.modeling_tf_bert import (TFBertModel, TFBertForMaskedLM,
-                                                       TFBertForNextSentencePrediction,
-                                                       TFBertForPreTraining,
-                                                       TFBertForSequenceClassification,
-                                                       TFBertForMultipleChoice,
-                                                       TFBertForTokenClassification,
-                                                       TFBertForQuestionAnswering,
-                                                       TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from transformers.modeling_tf_bert import (
+        TFBertModel,
+        TFBertForMaskedLM,
+        TFBertForNextSentencePrediction,
+        TFBertForPreTraining,
+        TFBertForSequenceClassification,
+        TFBertForMultipleChoice,
+        TFBertForTokenClassification,
+        TFBertForQuestionAnswering,
+        TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
 
 @require_tf
 class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
 
-    all_model_classes = (TFBertModel, TFBertForMaskedLM, TFBertForNextSentencePrediction,
-                         TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification,
-                         TFBertForTokenClassification) if is_tf_available() else ()
+    all_model_classes = (
+        (
+            TFBertModel,
+            TFBertForMaskedLM,
+            TFBertForNextSentencePrediction,
+            TFBertForPreTraining,
+            TFBertForQuestionAnswering,
+            TFBertForSequenceClassification,
+            TFBertForTokenClassification,
+        )
+        if is_tf_available()
+        else ()
+    )
 
     class TFBertModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_input_mask=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                    ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_input_mask=True,
+            use_token_type_ids=True,
+            use_labels=True,
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -123,15 +136,16 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
                 attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                 max_position_embeddings=self.max_position_embeddings,
                 type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range)
+                initializer_range=self.initializer_range,
+            )
 
             return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 
-        def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_model(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFBertModel(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
             sequence_output, pooled_output = model(inputs)
 
             inputs = [input_ids, input_mask]
@@ -144,128 +158,119 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
                 "pooled_output": pooled_output.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].shape),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
 
-
-        def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_for_masked_lm(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFBertForMaskedLM(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
-            prediction_scores, = model(inputs)
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            (prediction_scores,) = model(inputs)
             result = {
                 "prediction_scores": prediction_scores.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
-
-        def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_for_next_sequence_prediction(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFBertForNextSentencePrediction(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
-            seq_relationship_score, = model(inputs)
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            (seq_relationship_score,) = model(inputs)
             result = {
                 "seq_relationship_score": seq_relationship_score.numpy(),
             }
-            self.parent.assertListEqual(
-                list(result["seq_relationship_score"].shape),
-                [self.batch_size, 2])
+            self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])
 
-
-        def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_for_pretraining(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFBertForPreTraining(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
             prediction_scores, seq_relationship_score = model(inputs)
             result = {
                 "prediction_scores": prediction_scores.numpy(),
                 "seq_relationship_score": seq_relationship_score.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
-            self.parent.assertListEqual(
-                list(result["seq_relationship_score"].shape),
-                [self.batch_size, 2])
+                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
+            self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])
 
-
-        def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_for_sequence_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = TFBertForSequenceClassification(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
-            logits, = model(inputs)
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            (logits,) = model(inputs)
             result = {
                 "logits": logits.numpy(),
             }
-            self.parent.assertListEqual(
-                list(result["logits"].shape),
-                [self.batch_size, self.num_labels])
+            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
 
-
-        def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_for_multiple_choice(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_choices = self.num_choices
             model = TFBertForMultipleChoice(config=config)
             multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
             multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
             multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-            inputs = {'input_ids': multiple_choice_inputs_ids,
-                      'attention_mask': multiple_choice_input_mask,
-                      'token_type_ids': multiple_choice_token_type_ids}
-            logits, = model(inputs)
+            inputs = {
+                "input_ids": multiple_choice_inputs_ids,
+                "attention_mask": multiple_choice_input_mask,
+                "token_type_ids": multiple_choice_token_type_ids,
+            }
+            (logits,) = model(inputs)
             result = {
                 "logits": logits.numpy(),
             }
-            self.parent.assertListEqual(
-                list(result["logits"].shape),
-                [self.batch_size, self.num_choices])
+            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
 
-
-        def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_for_token_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = TFBertForTokenClassification(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
-            logits, = model(inputs)
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            (logits,) = model(inputs)
             result = {
                 "logits": logits.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["logits"].shape),
-                [self.batch_size, self.seq_length, self.num_labels])
+                list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
+            )
 
-
-        def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert_for_question_answering(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFBertForQuestionAnswering(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
             start_logits, end_logits = model(inputs)
             result = {
                 "start_logits": start_logits.numpy(),
                 "end_logits": end_logits.numpy(),
             }
-            self.parent.assertListEqual(
-                list(result["start_logits"].shape),
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["end_logits"].shape),
-                [self.batch_size, self.seq_length])
-
+            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, input_mask,
-             sequence_labels, token_labels, choice_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            (
+                config,
+                input_ids,
+                token_type_ids,
+                input_mask,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
             return config, inputs_dict
 
     def setUp(self):
@@ -310,10 +315,10 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
     @slow
     def test_model_from_pretrained(self):
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-        for model_name in ['bert-base-uncased']:
+        for model_name in ["bert-base-uncased"]:
             model = TFBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
+
 if __name__ == "__main__":
     unittest.main()
-
diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py
index 5a5873e81b..d65e270ae1 100644
--- a/transformers/tests/modeling_tf_common_test.py
+++ b/transformers/tests/modeling_tf_common_test.py
@@ -35,6 +35,7 @@ if is_tf_available():
     import tensorflow as tf
     import numpy as np
     from transformers import TFPreTrainedModel
+
     # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
 if sys.version_info[0] == 2:
@@ -42,25 +43,31 @@ if sys.version_info[0] == 2:
 
     class TemporaryDirectory(object):
         """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""
+
         def __enter__(self):
             self.name = tempfile.mkdtemp()
             return self.name
+
         def __exit__(self, exc_type, exc_value, traceback):
             shutil.rmtree(self.name)
+
+
 else:
     import pickle
+
     TemporaryDirectory = tempfile.TemporaryDirectory
     unicode = str
 
+
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
     for key in configs_no_init.__dict__.keys():
-        if '_range' in key or '_std' in key:
+        if "_range" in key or "_std" in key:
             setattr(configs_no_init, key, 0.0)
     return configs_no_init
 
-class TFCommonTestCases:
 
+class TFCommonTestCases:
     @require_tf
     class TFCommonModelTester(unittest.TestCase):
 
@@ -126,8 +133,9 @@ class TFCommonTestCases:
 
                 # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
                 pt_model.eval()
-                pt_inputs_dict = dict((name, torch.from_numpy(key.numpy()).to(torch.long))
-                                      for name, key in inputs_dict.items())
+                pt_inputs_dict = dict(
+                    (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items()
+                )
                 with torch.no_grad():
                     pto = pt_model(**pt_inputs_dict)
                 tfo = tf_model(inputs_dict, training=False)
@@ -140,18 +148,19 @@ class TFCommonTestCases:
 
                 # Check we can load pt model in tf and vice-versa with checkpoint => model functions
                 with TemporaryDirectory() as tmpdirname:
-                    pt_checkpoint_path = os.path.join(tmpdirname, 'pt_model.bin')
+                    pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
                     torch.save(pt_model.state_dict(), pt_checkpoint_path)
                     tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
 
-                    tf_checkpoint_path = os.path.join(tmpdirname, 'tf_model.h5')
+                    tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
                     tf_model.save_weights(tf_checkpoint_path)
                     pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
 
                 # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
                 pt_model.eval()
-                pt_inputs_dict = dict((name, torch.from_numpy(key.numpy()).to(torch.long))
-                                      for name, key in inputs_dict.items())
+                pt_inputs_dict = dict(
+                    (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items()
+                )
                 with torch.no_grad():
                     pto = pt_model(**pt_inputs_dict)
                 tfo = tf_model(inputs_dict)
@@ -166,13 +175,19 @@ class TFCommonTestCases:
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
             if self.is_encoder_decoder:
-                input_ids = {'decoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='decoder_input_ids', dtype='int32'),
-                             'encoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='encoder_input_ids', dtype='int32')}
+                input_ids = {
+                    "decoder_input_ids": tf.keras.Input(
+                        batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"
+                    ),
+                    "encoder_input_ids": tf.keras.Input(
+                        batch_shape=(2, 2000), name="encoder_input_ids", dtype="int32"
+                    ),
+                }
             else:
-                input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
+                input_ids = tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32")
             optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
             loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
-            metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+            metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
 
             for model_class in self.all_model_classes:
                 # Prepare our model
@@ -188,7 +203,7 @@ class TFCommonTestCases:
                 hidden_states = outputs_dict[0]
 
                 # Add a dense layer on top to test intetgration with other keras modules
-                outputs = tf.keras.layers.Dense(2, activation='softmax', name='outputs')(hidden_states)
+                outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
 
                 # Compile extended model
                 extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs])
@@ -202,7 +217,9 @@ class TFCommonTestCases:
                 outputs_dict = model(inputs_dict)
 
                 inputs_keywords = copy.deepcopy(inputs_dict)
-                input_ids = inputs_keywords.pop('input_ids' if not self.is_encoder_decoder else 'decoder_input_ids', None)
+                input_ids = inputs_keywords.pop(
+                    "input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None
+                )
                 outputs_keywords = model(input_ids, **inputs_keywords)
 
                 output_dict = outputs_dict[0].numpy()
@@ -213,10 +230,22 @@ class TFCommonTestCases:
         def test_attention_outputs(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-            decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
-            encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
-            decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
-            encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
+            decoder_seq_length = (
+                self.model_tester.decoder_seq_length
+                if hasattr(self.model_tester, "decoder_seq_length")
+                else self.model_tester.seq_length
+            )
+            encoder_seq_length = (
+                self.model_tester.encoder_seq_length
+                if hasattr(self.model_tester, "encoder_seq_length")
+                else self.model_tester.seq_length
+            )
+            decoder_key_length = (
+                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length
+            )
+            encoder_key_length = (
+                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
+            )
 
             for model_class in self.all_model_classes:
                 config.output_attentions = True
@@ -229,22 +258,20 @@ class TFCommonTestCases:
                 self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads,
-                    encoder_seq_length,
-                    encoder_key_length])
+                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+                )
                 out_len = len(outputs)
 
                 if self.is_encoder_decoder:
                     self.assertEqual(out_len % 2, 0)
-                    decoder_attentions = outputs[(out_len // 2)-1]
+                    decoder_attentions = outputs[(out_len // 2) - 1]
                     self.assertEqual(model.config.output_attentions, True)
                     self.assertEqual(model.config.output_hidden_states, False)
                     self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
                     self.assertListEqual(
                         list(decoder_attentions[0].shape[-3:]),
-                        [self.model_tester.num_attention_heads,
-                         decoder_seq_length,
-                         decoder_key_length])
+                        [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
+                    )
 
                 # Check attention is always last and order is fine
                 config.output_attentions = True
@@ -259,9 +286,8 @@ class TFCommonTestCases:
                 self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads,
-                    encoder_seq_length,
-                    encoder_key_length])
+                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+                )
 
         def test_hidden_states_output(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -276,8 +302,8 @@ class TFCommonTestCases:
                 self.assertEqual(model.config.output_hidden_states, True)
                 self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                 self.assertListEqual(
-                    list(hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size])
+                    list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size]
+                )
 
         def test_model_common_attributes(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -357,9 +383,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
     for _ in range(total_dims):
         values.append(rng.randint(0, vocab_size - 1))
 
-    output = tf.constant(values,
-                         shape=shape,
-                         dtype=dtype if dtype is not None else tf.int32)
+    output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32)
 
     return output
 
diff --git a/transformers/tests/modeling_tf_ctrl_test.py b/transformers/tests/modeling_tf_ctrl_test.py
index 93b231e517..fb8c4c2551 100644
--- a/transformers/tests/modeling_tf_ctrl_test.py
+++ b/transformers/tests/modeling_tf_ctrl_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 import unittest
 import sys
 
-from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_tf, slow
 
@@ -27,8 +27,7 @@ from transformers import CTRLConfig, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from transformers.modeling_tf_ctrl import (TFCTRLModel, TFCTRLLMHeadModel,
-                                                TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from transformers.modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
 @require_tf
@@ -37,32 +36,32 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
     all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()
 
     class TFCTRLModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_token_type_ids=True,
-                     use_input_mask=True,
-                     use_labels=True,
-                     use_mc_token_ids=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                     ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_token_type_ids=True,
+            use_input_mask=True,
+            use_labels=True,
+            use_mc_token_ids=True,
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -127,13 +126,21 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
 
             head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
 
-            return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels
+            return (
+                config,
+                input_ids,
+                input_mask,
+                head_mask,
+                token_type_ids,
+                mc_token_ids,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            )
 
         def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = TFCTRLModel(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
             sequence_output = model(inputs)[0]
 
             inputs = [input_ids, None, input_mask]  # None is the input for 'past'
@@ -145,30 +152,36 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
                 "sequence_output": sequence_output.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].shape),
-                [self.batch_size, self.seq_length, self.hidden_size])
-
+                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            )
 
         def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = TFCTRLLMHeadModel(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
             prediction_scores = model(inputs)[0]
             result = {
                 "prediction_scores": prediction_scores.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
 
-            (config, input_ids, input_mask, head_mask, token_type_ids,
-             mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
+            (
+                config,
+                input_ids,
+                input_mask,
+                head_mask,
+                token_type_ids,
+                mc_token_ids,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
 
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
             return config, inputs_dict
 
     def setUp(self):
@@ -192,6 +205,6 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
             model = TFCTRLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
+
 if __name__ == "__main__":
     unittest.main()
-
diff --git a/transformers/tests/modeling_tf_distilbert_test.py b/transformers/tests/modeling_tf_distilbert_test.py
index f28b5c397b..3260f63d56 100644
--- a/transformers/tests/modeling_tf_distilbert_test.py
+++ b/transformers/tests/modeling_tf_distilbert_test.py
@@ -18,7 +18,7 @@ from __future__ import print_function
 
 import unittest
 
-from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_tf, slow
 
@@ -26,48 +26,58 @@ from transformers import DistilBertConfig, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from transformers.modeling_tf_distilbert import (TFDistilBertModel,
-                                                             TFDistilBertForMaskedLM,
-                                                             TFDistilBertForQuestionAnswering,
-                                                             TFDistilBertForSequenceClassification)
+    from transformers.modeling_tf_distilbert import (
+        TFDistilBertModel,
+        TFDistilBertForMaskedLM,
+        TFDistilBertForQuestionAnswering,
+        TFDistilBertForSequenceClassification,
+    )
 
 
 @require_tf
 class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
 
-    all_model_classes = (TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering,
-                         TFDistilBertForSequenceClassification) if is_tf_available() else None
+    all_model_classes = (
+        (
+            TFDistilBertModel,
+            TFDistilBertForMaskedLM,
+            TFDistilBertForQuestionAnswering,
+            TFDistilBertForSequenceClassification,
+        )
+        if is_tf_available()
+        else None
+    )
     test_pruning = True
     test_torchscript = True
     test_resize_embeddings = True
     test_head_masking = True
 
     class TFDistilBertModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_input_mask=True,
-                     use_token_type_ids=False,
-                     use_labels=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                    ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_input_mask=True,
+            use_token_type_ids=False,
+            use_labels=True,
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -116,14 +126,16 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
                 dropout=self.hidden_dropout_prob,
                 attention_dropout=self.attention_probs_dropout_prob,
                 max_position_embeddings=self.max_position_embeddings,
-                initializer_range=self.initializer_range)
+                initializer_range=self.initializer_range,
+            )
 
             return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
 
-        def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_distilbert_model(
+            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFDistilBertModel(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask}
 
             outputs = model(inputs)
             sequence_output = outputs[0]
@@ -136,54 +148,51 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
                 "sequence_output": sequence_output.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].shape),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            )
 
-        def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_distilbert_for_masked_lm(
+            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFDistilBertForMaskedLM(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask}
             (prediction_scores,) = model(inputs)
             result = {
                 "prediction_scores": prediction_scores.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
-        def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_distilbert_for_question_answering(
+            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFDistilBertForQuestionAnswering(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask}
             start_logits, end_logits = model(inputs)
             result = {
                 "start_logits": start_logits.numpy(),
                 "end_logits": end_logits.numpy(),
             }
-            self.parent.assertListEqual(
-                list(result["start_logits"].shape),
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["end_logits"].shape),
-                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
 
-        def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_distilbert_for_sequence_classification(
+            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = TFDistilBertForSequenceClassification(config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask}
             (logits,) = model(inputs)
             result = {
                 "logits": logits.numpy(),
             }
-            self.parent.assertListEqual(
-                list(result["logits"].shape),
-                [self.batch_size, self.num_labels])
+            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
             (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids, 'attention_mask': input_mask}
+            inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
             return config, inputs_dict
 
     def setUp(self):
@@ -215,5 +224,6 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
     #         model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
     #         self.assertIsNotNone(model)
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/modeling_tf_gpt2_test.py b/transformers/tests/modeling_tf_gpt2_test.py
index 90920342ba..09b7eb0710 100644
--- a/transformers/tests/modeling_tf_gpt2_test.py
+++ b/transformers/tests/modeling_tf_gpt2_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 import unittest
 import sys
 
-from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_tf, slow
 
@@ -27,45 +27,47 @@ from transformers import GPT2Config, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
-                                                       TFGPT2DoubleHeadsModel,
-                                                       TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from transformers.modeling_tf_gpt2 import (
+        TFGPT2Model,
+        TFGPT2LMHeadModel,
+        TFGPT2DoubleHeadsModel,
+        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
 
 @require_tf
 class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
 
-    all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel,
-                         TFGPT2DoubleHeadsModel) if is_tf_available() else ()
+    all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel) if is_tf_available() else ()
     # all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel) if is_tf_available() else ()
 
     class TFGPT2ModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_token_type_ids=True,
-                     use_input_mask=True,
-                     use_labels=True,
-                     use_mc_token_ids=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                     ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_token_type_ids=True,
+            use_input_mask=True,
+            use_labels=True,
+            use_mc_token_ids=True,
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -130,13 +132,21 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
 
             head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
 
-            return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels
+            return (
+                config,
+                input_ids,
+                input_mask,
+                head_mask,
+                token_type_ids,
+                mc_token_ids,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            )
 
         def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = TFGPT2Model(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
             sequence_output = model(inputs)[0]
 
             inputs = [input_ids, None, input_mask]  # None is the input for 'past'
@@ -148,54 +158,58 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
                 "sequence_output": sequence_output.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].shape),
-                [self.batch_size, self.seq_length, self.hidden_size])
-
+                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            )
 
         def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = TFGPT2LMHeadModel(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
             prediction_scores = model(inputs)[0]
             result = {
                 "prediction_scores": prediction_scores.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
-
-        def create_and_check_gpt2_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
+        def create_and_check_gpt2_double_head(
+            self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
+        ):
             model = TFGPT2DoubleHeadsModel(config=config)
 
             multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
             multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
             multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
 
-            inputs = {'input_ids': multiple_choice_inputs_ids,
-                      'mc_token_ids': mc_token_ids,
-                      'attention_mask': multiple_choice_input_mask,
-                      'token_type_ids': multiple_choice_token_type_ids}
-            lm_logits, mc_logits = model(inputs)[:2]
-            result = {
-                "lm_logits": lm_logits.numpy(),
-                "mc_logits": mc_logits.numpy()
+            inputs = {
+                "input_ids": multiple_choice_inputs_ids,
+                "mc_token_ids": mc_token_ids,
+                "attention_mask": multiple_choice_input_mask,
+                "token_type_ids": multiple_choice_token_type_ids,
             }
+            lm_logits, mc_logits = model(inputs)[:2]
+            result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
             self.parent.assertListEqual(
-                list(result["lm_logits"].shape),
-                [self.batch_size, self.num_choices, self.seq_length, self.vocab_size])
-            self.parent.assertListEqual(
-                list(result["mc_logits"].shape),
-                [self.batch_size, self.num_choices])
+                list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
+            )
+            self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices])
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
 
-            (config, input_ids, input_mask, head_mask, token_type_ids,
-             mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
+            (
+                config,
+                input_ids,
+                input_mask,
+                head_mask,
+                token_type_ids,
+                mc_token_ids,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
 
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
             return config, inputs_dict
 
     def setUp(self):
@@ -223,6 +237,6 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
             model = TFGPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
+
 if __name__ == "__main__":
     unittest.main()
-
diff --git a/transformers/tests/modeling_tf_openai_gpt_test.py b/transformers/tests/modeling_tf_openai_gpt_test.py
index 065bf2acde..a59395e02b 100644
--- a/transformers/tests/modeling_tf_openai_gpt_test.py
+++ b/transformers/tests/modeling_tf_openai_gpt_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 import unittest
 import sys
 
-from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_tf, slow
 
@@ -27,44 +27,48 @@ from transformers import OpenAIGPTConfig, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
-                                                         TFOpenAIGPTDoubleHeadsModel,
-                                                         TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from transformers.modeling_tf_openai import (
+        TFOpenAIGPTModel,
+        TFOpenAIGPTLMHeadModel,
+        TFOpenAIGPTDoubleHeadsModel,
+        TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
 
 @require_tf
 class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
 
-    all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
-                         TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else ()
+    all_model_classes = (
+        (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else ()
+    )
 
     class TFOpenAIGPTModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_token_type_ids=True,
-                     use_input_mask=True,
-                     use_labels=True,
-                     use_mc_token_ids=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                     ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_token_type_ids=True,
+            use_input_mask=True,
+            use_labels=True,
+            use_mc_token_ids=True,
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -129,13 +133,21 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
 
             head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
 
-            return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels
+            return (
+                config,
+                input_ids,
+                input_mask,
+                head_mask,
+                token_type_ids,
+                mc_token_ids,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            )
 
         def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = TFOpenAIGPTModel(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
             sequence_output = model(inputs)[0]
 
             inputs = [input_ids, input_mask]
@@ -147,54 +159,58 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
                 "sequence_output": sequence_output.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].shape),
-                [self.batch_size, self.seq_length, self.hidden_size])
-
+                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            )
 
         def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = TFOpenAIGPTLMHeadModel(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
             prediction_scores = model(inputs)[0]
             result = {
                 "prediction_scores": prediction_scores.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
-
-        def create_and_check_openai_gpt_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
+        def create_and_check_openai_gpt_double_head(
+            self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
+        ):
             model = TFOpenAIGPTDoubleHeadsModel(config=config)
 
             multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
             multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
             multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
 
-            inputs = {'input_ids': multiple_choice_inputs_ids,
-                      'mc_token_ids': mc_token_ids,
-                      'attention_mask': multiple_choice_input_mask,
-                      'token_type_ids': multiple_choice_token_type_ids}
-            lm_logits, mc_logits = model(inputs)[:2]
-            result = {
-                "lm_logits": lm_logits.numpy(),
-                "mc_logits": mc_logits.numpy()
+            inputs = {
+                "input_ids": multiple_choice_inputs_ids,
+                "mc_token_ids": mc_token_ids,
+                "attention_mask": multiple_choice_input_mask,
+                "token_type_ids": multiple_choice_token_type_ids,
             }
+            lm_logits, mc_logits = model(inputs)[:2]
+            result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
             self.parent.assertListEqual(
-                list(result["lm_logits"].shape),
-                [self.batch_size, self.num_choices, self.seq_length, self.vocab_size])
-            self.parent.assertListEqual(
-                list(result["mc_logits"].shape),
-                [self.batch_size, self.num_choices])
+                list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
+            )
+            self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices])
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
 
-            (config, input_ids, input_mask, head_mask, token_type_ids,
-             mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
+            (
+                config,
+                input_ids,
+                input_mask,
+                head_mask,
+                token_type_ids,
+                mc_token_ids,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
 
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
             return config, inputs_dict
 
     def setUp(self):
@@ -222,6 +238,6 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
             model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
+
 if __name__ == "__main__":
     unittest.main()
-
diff --git a/transformers/tests/modeling_tf_roberta_test.py b/transformers/tests/modeling_tf_roberta_test.py
index 93c478ae28..23ea557404 100644
--- a/transformers/tests/modeling_tf_roberta_test.py
+++ b/transformers/tests/modeling_tf_roberta_test.py
@@ -18,7 +18,7 @@ from __future__ import print_function
 
 import unittest
 
-from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_tf, slow
 
@@ -27,44 +27,48 @@ from transformers import RobertaConfig, is_tf_available
 if is_tf_available():
     import tensorflow as tf
     import numpy
-    from transformers.modeling_tf_roberta import (TFRobertaModel, TFRobertaForMaskedLM,
-                                                          TFRobertaForSequenceClassification,
-                                                          TFRobertaForTokenClassification,
-                                                          TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from transformers.modeling_tf_roberta import (
+        TFRobertaModel,
+        TFRobertaForMaskedLM,
+        TFRobertaForSequenceClassification,
+        TFRobertaForTokenClassification,
+        TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
 
 @require_tf
 class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
 
-    all_model_classes = (TFRobertaModel,TFRobertaForMaskedLM,
-                         TFRobertaForSequenceClassification) if is_tf_available() else ()
+    all_model_classes = (
+        (TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification) if is_tf_available() else ()
+    )
 
     class TFRobertaModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_input_mask=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None,
-                    ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_input_mask=True,
+            use_token_type_ids=True,
+            use_labels=True,
+            vocab_size=99,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            intermediate_size=37,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -118,16 +122,16 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
                 attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                 max_position_embeddings=self.max_position_embeddings,
                 type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range)
+                initializer_range=self.initializer_range,
+            )
 
             return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 
-        def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
-                                           token_labels, choice_labels):
+        def create_and_check_roberta_model(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFRobertaModel(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
             sequence_output = model(inputs)[0]
 
             inputs = [input_ids, input_mask]
@@ -139,39 +143,47 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
                 "sequence_output": sequence_output.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].shape),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            )
 
-        def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
-                                                   token_labels, choice_labels):
+        def create_and_check_roberta_for_masked_lm(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             model = TFRobertaForMaskedLM(config=config)
             prediction_scores = model([input_ids, input_mask, token_type_ids])[0]
             result = {
                 "prediction_scores": prediction_scores.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
-        def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_roberta_for_token_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
             config.num_labels = self.num_labels
             model = TFRobertaForTokenClassification(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
-            logits, = model(inputs)
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            (logits,) = model(inputs)
             result = {
                 "logits": logits.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["logits"].shape),
-                [self.batch_size, self.seq_length, self.num_labels])
+                list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
+            )
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, input_mask,
-             sequence_labels, token_labels, choice_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            (
+                config,
+                input_ids,
+                token_type_ids,
+                input_mask,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
             return config, inputs_dict
 
     def setUp(self):
@@ -196,61 +208,43 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
             self.assertIsNotNone(model)
 
 
-
 class TFRobertaModelIntegrationTest(unittest.TestCase):
-
     @slow
     def test_inference_masked_lm(self):
-        model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
+        model = TFRobertaForMaskedLM.from_pretrained("roberta-base")
 
-        input_ids = tf.constant([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
+        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
         expected_shape = [1, 11, 50265]
-        self.assertEqual(
-            list(output.numpy().shape),
-            expected_shape
-        )
+        self.assertEqual(list(output.numpy().shape), expected_shape)
         # compare the actual values for a slice.
         expected_slice = tf.constant(
-            [[[33.8843, -4.3107, 22.7779],
-              [ 4.6533, -2.8099, 13.6252],
-              [ 1.8222, -3.6898,  8.8600]]]
-        )
-        self.assertTrue(
-            numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
+            [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]]
         )
+        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3))
 
     @slow
     def test_inference_no_head(self):
-        model = TFRobertaModel.from_pretrained('roberta-base')
+        model = TFRobertaModel.from_pretrained("roberta-base")
 
-        input_ids = tf.constant([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
+        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
         # compare the actual values for a slice.
         expected_slice = tf.constant(
-            [[[-0.0231,  0.0782,  0.0074],
-              [-0.1854,  0.0539, -0.0174],
-              [ 0.0548,  0.0799,  0.1687]]]
-        )
-        self.assertTrue(
-            numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
+            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0539, -0.0174], [0.0548, 0.0799, 0.1687]]]
         )
+        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3))
 
     @slow
     def test_inference_classification_head(self):
-        model = TFRobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
+        model = TFRobertaForSequenceClassification.from_pretrained("roberta-large-mnli")
 
-        input_ids = tf.constant([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
+        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
         expected_shape = [1, 3]
-        self.assertEqual(
-            list(output.numpy().shape),
-            expected_shape
-        )
-        expected_tensor = tf.constant([[-0.9469,  0.3913,  0.5118]])
-        self.assertTrue(
-            numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-3)
-        )
+        self.assertEqual(list(output.numpy().shape), expected_shape)
+        expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]])
+        self.assertTrue(numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-3))
 
 
 if __name__ == "__main__":
diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py
index da9ce6f89d..521085219b 100644
--- a/transformers/tests/modeling_tf_t5_test.py
+++ b/transformers/tests/modeling_tf_t5_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 import unittest
 import sys
 
-from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_tf, slow
 
@@ -27,8 +27,7 @@ from transformers import T5Config, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,
-                                             TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from transformers.modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
 @require_tf
@@ -38,25 +37,25 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
     all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else ()
 
     class TFT5ModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_input_mask=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     n_positions=14,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     d_ff=37,
-                     relative_attention_num_buckets=8,
-                     dropout_rate=0.1,
-                     initializer_factor=0.002,
-                     scope=None,
-                    ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_input_mask=True,
+            use_labels=True,
+            vocab_size=99,
+            n_positions=14,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            d_ff=37,
+            relative_attention_num_buckets=8,
+            dropout_rate=0.1,
+            initializer_factor=0.002,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -95,53 +94,58 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
                 num_heads=self.num_attention_heads,
                 relative_attention_num_buckets=self.relative_attention_num_buckets,
                 dropout_rate=self.dropout_rate,
-                initializer_factor=self.initializer_factor)
+                initializer_factor=self.initializer_factor,
+            )
 
             return (config, input_ids, input_mask, token_labels)
 
         def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
             model = TFT5Model(config=config)
-            inputs = {'encoder_input_ids': input_ids,
-                      'decoder_input_ids': input_ids,
-                      'decoder_attention_mask': input_mask}
+            inputs = {
+                "encoder_input_ids": input_ids,
+                "decoder_input_ids": input_ids,
+                "decoder_attention_mask": input_mask,
+            }
             encoder_output, decoder_output = model(inputs)
 
-            encoder_output, decoder_output = model(input_ids,
-                                                   decoder_attention_mask=input_mask,
-                                                   encoder_input_ids=input_ids)
+            encoder_output, decoder_output = model(
+                input_ids, decoder_attention_mask=input_mask, encoder_input_ids=input_ids
+            )
 
             result = {
                 "encoder_output": encoder_output.numpy(),
                 "decoder_output": decoder_output.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["encoder_output"].shape),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["encoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(
-                list(result["decoder_output"].shape),
-                [self.batch_size, self.seq_length, self.hidden_size])
-
+                list(result["decoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            )
 
         def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
             model = TFT5WithLMHeadModel(config=config)
-            inputs = {'encoder_input_ids': input_ids,
-                      'decoder_input_ids': input_ids,
-                      'decoder_attention_mask': input_mask}
+            inputs = {
+                "encoder_input_ids": input_ids,
+                "decoder_input_ids": input_ids,
+                "decoder_attention_mask": input_mask,
+            }
             prediction_scores, decoder_output = model(inputs)
             result = {
                 "prediction_scores": prediction_scores.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
-
+                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
             (config, input_ids, input_mask, token_labels) = config_and_inputs
-            inputs_dict = {'encoder_input_ids': input_ids,
-                           'decoder_input_ids': input_ids,
-                           'decoder_attention_mask': input_mask}
+            inputs_dict = {
+                "encoder_input_ids": input_ids,
+                "decoder_input_ids": input_ids,
+                "decoder_attention_mask": input_mask,
+            }
             return config, inputs_dict
 
     def setUp(self):
@@ -161,9 +165,10 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        for model_name in ['t5-small']:
+        for model_name in ["t5-small"]:
             model = TFT5Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/modeling_tf_transfo_xl_test.py b/transformers/tests/modeling_tf_transfo_xl_test.py
index 8225c09275..20de598d05 100644
--- a/transformers/tests/modeling_tf_transfo_xl_test.py
+++ b/transformers/tests/modeling_tf_transfo_xl_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 import unittest
 import random
 
-from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_tf, slow
 
@@ -27,9 +27,11 @@ from transformers import TransfoXLConfig, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from transformers.modeling_tf_transfo_xl import (TFTransfoXLModel,
-                                                             TFTransfoXLLMHeadModel,
-                                                             TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from transformers.modeling_tf_transfo_xl import (
+        TFTransfoXLModel,
+        TFTransfoXLLMHeadModel,
+        TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
 
 @require_tf
@@ -41,27 +43,27 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
     test_resize_embeddings = False
 
     class TFTransfoXLModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     mem_len=30,
-                     clamp_len=15,
-                     is_training=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     cutoffs=[10, 50, 80],
-                     hidden_size=32,
-                     d_embed=32,
-                     num_attention_heads=4,
-                     d_head=8,
-                     d_inner=128,
-                     div_val=2,
-                     num_hidden_layers=5,
-                     scope=None,
-                     seed=1,
-                     ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            mem_len=30,
+            clamp_len=15,
+            is_training=True,
+            use_labels=True,
+            vocab_size=99,
+            cutoffs=[10, 50, 80],
+            hidden_size=32,
+            d_embed=32,
+            num_attention_heads=4,
+            d_head=8,
+            d_inner=128,
+            div_val=2,
+            num_hidden_layers=5,
+            scope=None,
+            seed=1,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -101,7 +103,8 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
                 d_head=self.d_head,
                 d_inner=self.d_inner,
                 div_val=self.div_val,
-                n_layer=self.num_hidden_layers)
+                n_layer=self.num_hidden_layers,
+            )
 
             return (config, input_ids_1, input_ids_2, lm_labels)
 
@@ -114,8 +117,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
 
             hidden_states_1, mems_1 = model(input_ids_1)
 
-            inputs = {'input_ids': input_ids_2,
-                      'mems': mems_1}
+            inputs = {"input_ids": input_ids_2, "mems": mems_1}
 
             hidden_states_2, mems_2 = model(inputs)
 
@@ -127,33 +129,31 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
             }
 
             self.parent.assertListEqual(
-                list(result["hidden_states_1"].shape),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["hidden_states_1"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(
-                list(result["hidden_states_2"].shape),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["hidden_states_2"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(
                 list(list(mem.shape) for mem in result["mems_1"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
             self.parent.assertListEqual(
                 list(list(mem.shape) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
-
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
         def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
             model = TFTransfoXLLMHeadModel(config)
 
             lm_logits_1, mems_1 = model(input_ids_1)
 
-            inputs = {'input_ids': input_ids_1,
-                      'labels': lm_labels}
+            inputs = {"input_ids": input_ids_1, "labels": lm_labels}
             _, mems_1 = model(inputs)
 
             lm_logits_2, mems_2 = model([input_ids_2, mems_1])
 
-            inputs = {'input_ids': input_ids_1,
-                      'mems': mems_1,
-                      'labels': lm_labels}
+            inputs = {"input_ids": input_ids_1, "mems": mems_1, "labels": lm_labels}
 
             _, mems_2 = model(inputs)
 
@@ -165,26 +165,27 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
             }
 
             self.parent.assertListEqual(
-                list(result["lm_logits_1"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["lm_logits_1"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
             self.parent.assertListEqual(
                 list(list(mem.shape) for mem in result["mems_1"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
             self.parent.assertListEqual(
-                list(result["lm_logits_2"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["lm_logits_2"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
             self.parent.assertListEqual(
                 list(list(mem.shape) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
             (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids_1}
+            inputs_dict = {"input_ids": input_ids_1}
             return config, inputs_dict
 
-
     def setUp(self):
         self.model_tester = TFTransfoXLModelTest.TFTransfoXLModelTester(self)
         self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
diff --git a/transformers/tests/modeling_tf_xlm_test.py b/transformers/tests/modeling_tf_xlm_test.py
index 065d355b45..9162bf2b38 100644
--- a/transformers/tests/modeling_tf_xlm_test.py
+++ b/transformers/tests/modeling_tf_xlm_test.py
@@ -22,13 +22,16 @@ from transformers import is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from transformers import (XLMConfig, TFXLMModel,
-                                      TFXLMWithLMHeadModel,
-                                      TFXLMForSequenceClassification,
-                                      TFXLMForQuestionAnsweringSimple,
-                                      TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from transformers import (
+        XLMConfig,
+        TFXLMModel,
+        TFXLMWithLMHeadModel,
+        TFXLMForSequenceClassification,
+        TFXLMForQuestionAnsweringSimple,
+        TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
-from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_tf, slow
 
@@ -36,43 +39,44 @@ from .utils import CACHE_DIR, require_tf, slow
 @require_tf
 class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
 
-    all_model_classes = (TFXLMModel, TFXLMWithLMHeadModel,
-                         TFXLMForSequenceClassification,
-                         TFXLMForQuestionAnsweringSimple) if is_tf_available() else ()
-
+    all_model_classes = (
+        (TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple)
+        if is_tf_available()
+        else ()
+    )
 
     class TFXLMModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_input_lengths=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     gelu_activation=True,
-                     sinusoidal_embeddings=False,
-                     causal=False,
-                     asm=False,
-                     n_langs=2,
-                     vocab_size=99,
-                     n_special=0,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     summary_type="last",
-                     use_proj=True,
-                     scope=None,
-                    ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_input_lengths=True,
+            use_token_type_ids=True,
+            use_labels=True,
+            gelu_activation=True,
+            sinusoidal_embeddings=False,
+            causal=False,
+            asm=False,
+            n_langs=2,
+            vocab_size=99,
+            n_special=0,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            summary_type="last",
+            use_proj=True,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -109,7 +113,9 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
 
             input_lengths = None
             if self.use_input_lengths:
-                input_lengths = ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2  # small variation of seq_length
+                input_lengths = (
+                    ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
+                )  # small variation of seq_length
 
             token_type_ids = None
             if self.use_token_type_ids:
@@ -124,30 +130,48 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
                 is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
 
             config = XLMConfig(
-                 vocab_size=self.vocab_size,
-                 n_special=self.n_special,
-                 emb_dim=self.hidden_size,
-                 n_layers=self.num_hidden_layers,
-                 n_heads=self.num_attention_heads,
-                 dropout=self.hidden_dropout_prob,
-                 attention_dropout=self.attention_probs_dropout_prob,
-                 gelu_activation=self.gelu_activation,
-                 sinusoidal_embeddings=self.sinusoidal_embeddings,
-                 asm=self.asm,
-                 causal=self.causal,
-                 n_langs=self.n_langs,
-                 max_position_embeddings=self.max_position_embeddings,
-                 initializer_range=self.initializer_range,
-                 summary_type=self.summary_type,
-                 use_proj=self.use_proj)
+                vocab_size=self.vocab_size,
+                n_special=self.n_special,
+                emb_dim=self.hidden_size,
+                n_layers=self.num_hidden_layers,
+                n_heads=self.num_attention_heads,
+                dropout=self.hidden_dropout_prob,
+                attention_dropout=self.attention_probs_dropout_prob,
+                gelu_activation=self.gelu_activation,
+                sinusoidal_embeddings=self.sinusoidal_embeddings,
+                asm=self.asm,
+                causal=self.causal,
+                n_langs=self.n_langs,
+                max_position_embeddings=self.max_position_embeddings,
+                initializer_range=self.initializer_range,
+                summary_type=self.summary_type,
+                use_proj=self.use_proj,
+            )
 
-            return config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask
+            return (
+                config,
+                input_ids,
+                token_type_ids,
+                input_lengths,
+                sequence_labels,
+                token_labels,
+                is_impossible_labels,
+                input_mask,
+            )
 
-        def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+        def create_and_check_xlm_model(
+            self,
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            input_mask,
+        ):
             model = TFXLMModel(config=config)
-            inputs = {'input_ids': input_ids,
-                      'lengths': input_lengths,
-                      'langs': token_type_ids}
+            inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
             outputs = model(inputs)
 
             inputs = [input_ids, input_mask]
@@ -157,16 +181,23 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
                 "sequence_output": sequence_output.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].shape),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            )
 
-
-        def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+        def create_and_check_xlm_lm_head(
+            self,
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            input_mask,
+        ):
             model = TFXLMWithLMHeadModel(config)
 
-            inputs = {'input_ids': input_ids,
-                      'lengths': input_lengths,
-                      'langs': token_type_ids}
+            inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
             outputs = model(inputs)
 
             logits = outputs[0]
@@ -176,15 +207,23 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
             }
 
             self.parent.assertListEqual(
-                list(result["logits"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
-
-        def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+        def create_and_check_xlm_qa(
+            self,
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            input_mask,
+        ):
             model = TFXLMForQuestionAnsweringSimple(config)
 
-            inputs = {'input_ids': input_ids,
-                      'lengths': input_lengths}
+            inputs = {"input_ids": input_ids, "lengths": input_lengths}
 
             outputs = model(inputs)
             start_logits, end_logits = model(inputs)
@@ -194,19 +233,23 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
                 "end_logits": end_logits.numpy(),
             }
 
-            self.parent.assertListEqual(
-                list(result["start_logits"].shape),
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["end_logits"].shape),
-                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
 
-
-        def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+        def create_and_check_xlm_sequence_classif(
+            self,
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            input_mask,
+        ):
             model = TFXLMForSequenceClassification(config)
 
-            inputs = {'input_ids': input_ids,
-                      'lengths': input_lengths}
+            inputs = {"input_ids": input_ids, "lengths": input_lengths}
 
             (logits,) = model(inputs)
 
@@ -214,16 +257,26 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
                 "logits": logits.numpy(),
             }
 
-            self.parent.assertListEqual(
-                list(result["logits"].shape),
-                [self.batch_size, self.type_sequence_label_size])
-
+            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, input_lengths,
-             sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'langs': token_type_ids, 'lengths': input_lengths}
+            (
+                config,
+                input_ids,
+                token_type_ids,
+                input_lengths,
+                sequence_labels,
+                token_labels,
+                is_impossible_labels,
+                input_mask,
+            ) = config_and_inputs
+            inputs_dict = {
+                "input_ids": input_ids,
+                "token_type_ids": token_type_ids,
+                "langs": token_type_ids,
+                "lengths": input_lengths,
+            }
             return config, inputs_dict
 
     def setUp(self):
diff --git a/transformers/tests/modeling_tf_xlnet_test.py b/transformers/tests/modeling_tf_xlnet_test.py
index 15fd917481..9a56384a0c 100644
--- a/transformers/tests/modeling_tf_xlnet_test.py
+++ b/transformers/tests/modeling_tf_xlnet_test.py
@@ -26,13 +26,16 @@ from transformers import XLNetConfig, is_tf_available
 if is_tf_available():
     import tensorflow as tf
 
-    from transformers.modeling_tf_xlnet import (TFXLNetModel, TFXLNetLMHeadModel,
-                                                        TFXLNetForSequenceClassification,
-                                                        TFXLNetForTokenClassification,
-                                                        TFXLNetForQuestionAnsweringSimple,
-                                                        TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from transformers.modeling_tf_xlnet import (
+        TFXLNetModel,
+        TFXLNetLMHeadModel,
+        TFXLNetForSequenceClassification,
+        TFXLNetForTokenClassification,
+        TFXLNetForQuestionAnsweringSimple,
+        TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
 
-from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_tf, slow
 
@@ -40,37 +43,44 @@ from .utils import CACHE_DIR, require_tf, slow
 @require_tf
 class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
 
-    all_model_classes=(TFXLNetModel, TFXLNetLMHeadModel,
-                       TFXLNetForSequenceClassification,
-                       TFXLNetForTokenClassification,
-                       TFXLNetForQuestionAnsweringSimple) if is_tf_available() else ()
+    all_model_classes = (
+        (
+            TFXLNetModel,
+            TFXLNetLMHeadModel,
+            TFXLNetForSequenceClassification,
+            TFXLNetForTokenClassification,
+            TFXLNetForQuestionAnsweringSimple,
+        )
+        if is_tf_available()
+        else ()
+    )
     test_pruning = False
 
     class TFXLNetModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     mem_len=10,
-                     clamp_len=-1,
-                     reuse_len=15,
-                     is_training=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     cutoffs=[10, 50, 80],
-                     hidden_size=32,
-                     num_attention_heads=4,
-                     d_inner=128,
-                     num_hidden_layers=5,
-                     type_sequence_label_size=2,
-                     untie_r=True,
-                     bi_data=False,
-                     same_length=False,
-                     initializer_range=0.05,
-                     seed=1,
-                     type_vocab_size=2,
-            ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            mem_len=10,
+            clamp_len=-1,
+            reuse_len=15,
+            is_training=True,
+            use_labels=True,
+            vocab_size=99,
+            cutoffs=[10, 50, 80],
+            hidden_size=32,
+            num_attention_heads=4,
+            d_inner=128,
+            num_hidden_layers=5,
+            type_sequence_label_size=2,
+            untie_r=True,
+            bi_data=False,
+            same_length=False,
+            initializer_range=0.05,
+            seed=1,
+            type_vocab_size=2,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -131,22 +141,44 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                 reuse_len=self.reuse_len,
                 bi_data=self.bi_data,
                 initializer_range=self.initializer_range,
-                num_labels=self.type_sequence_label_size)
+                num_labels=self.type_sequence_label_size,
+            )
 
-            return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                    target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels)
+            return (
+                config,
+                input_ids_1,
+                input_ids_2,
+                input_ids_q,
+                perm_mask,
+                input_mask,
+                target_mapping,
+                segment_ids,
+                lm_labels,
+                sequence_labels,
+                is_impossible_labels,
+            )
 
         def set_seed(self):
             random.seed(self.seed)
             tf.random.set_seed(self.seed)
 
-        def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+        def create_and_check_xlnet_base_model(
+            self,
+            config,
+            input_ids_1,
+            input_ids_2,
+            input_ids_q,
+            perm_mask,
+            input_mask,
+            target_mapping,
+            segment_ids,
+            lm_labels,
+            sequence_labels,
+            is_impossible_labels,
+        ):
             model = TFXLNetModel(config)
 
-            inputs = {'input_ids': input_ids_1,
-                      'input_mask': input_mask,
-                      'token_type_ids': segment_ids}
+            inputs = {"input_ids": input_ids_1, "input_mask": input_mask, "token_type_ids": segment_ids}
 
             _, _ = model(inputs)
 
@@ -165,30 +197,38 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
             self.parent.assertEqual(len(no_mems_outputs), 1)
 
             self.parent.assertListEqual(
-                list(result["outputs"].shape),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["outputs"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(
                 list(list(mem.shape) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
-        def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+        def create_and_check_xlnet_lm_head(
+            self,
+            config,
+            input_ids_1,
+            input_ids_2,
+            input_ids_q,
+            perm_mask,
+            input_mask,
+            target_mapping,
+            segment_ids,
+            lm_labels,
+            sequence_labels,
+            is_impossible_labels,
+        ):
             model = TFXLNetLMHeadModel(config)
 
-            inputs_1 = {'input_ids': input_ids_1,
-                      'token_type_ids': segment_ids}
+            inputs_1 = {"input_ids": input_ids_1, "token_type_ids": segment_ids}
 
             all_logits_1, mems_1 = model(inputs_1)
 
-            inputs_2 = {'input_ids': input_ids_2,
-                        'mems': mems_1,
-                        'token_type_ids': segment_ids}
+            inputs_2 = {"input_ids": input_ids_2, "mems": mems_1, "token_type_ids": segment_ids}
 
             all_logits_2, mems_2 = model(inputs_2)
 
-            inputs_3 = {'input_ids': input_ids_q,
-                        'perm_mask': perm_mask,
-                        'target_mapping': target_mapping}
+            inputs_3 = {"input_ids": input_ids_q, "perm_mask": perm_mask, "target_mapping": target_mapping}
 
             logits, _ = model(inputs_3)
 
@@ -200,26 +240,38 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
             }
 
             self.parent.assertListEqual(
-                list(result["all_logits_1"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["all_logits_1"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
             self.parent.assertListEqual(
                 list(list(mem.shape) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
             self.parent.assertListEqual(
-                list(result["all_logits_2"].shape),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["all_logits_2"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
             self.parent.assertListEqual(
                 list(list(mem.shape) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
-        def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+        def create_and_check_xlnet_qa(
+            self,
+            config,
+            input_ids_1,
+            input_ids_2,
+            input_ids_q,
+            perm_mask,
+            input_mask,
+            target_mapping,
+            segment_ids,
+            lm_labels,
+            sequence_labels,
+            is_impossible_labels,
+        ):
             model = TFXLNetForQuestionAnsweringSimple(config)
 
-            inputs = {'input_ids': input_ids_1,
-                      'attention_mask': input_mask,
-                      'token_type_ids': segment_ids}
+            inputs = {"input_ids": input_ids_1, "attention_mask": input_mask, "token_type_ids": segment_ids}
             start_logits, end_logits, mems = model(inputs)
 
             result = {
@@ -228,18 +280,27 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                 "mems": [m.numpy() for m in mems],
             }
 
-            self.parent.assertListEqual(
-                list(result["start_logits"].shape),
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["end_logits"].shape),
-                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
             self.parent.assertListEqual(
                 list(list(mem.shape) for mem in result["mems"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
-        def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+        def create_and_check_xlnet_sequence_classif(
+            self,
+            config,
+            input_ids_1,
+            input_ids_2,
+            input_ids_q,
+            perm_mask,
+            input_mask,
+            target_mapping,
+            segment_ids,
+            lm_labels,
+            sequence_labels,
+            is_impossible_labels,
+        ):
             model = TFXLNetForSequenceClassification(config)
 
             logits, mems_1 = model(input_ids_1)
@@ -249,42 +310,64 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                 "logits": logits.numpy(),
             }
 
-            self.parent.assertListEqual(
-                list(result["logits"].shape),
-                [self.batch_size, self.type_sequence_label_size])
+            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
             self.parent.assertListEqual(
                 list(list(mem.shape) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
-        def create_and_check_xlnet_for_token_classification(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+        def create_and_check_xlnet_for_token_classification(
+            self,
+            config,
+            input_ids_1,
+            input_ids_2,
+            input_ids_q,
+            perm_mask,
+            input_mask,
+            target_mapping,
+            segment_ids,
+            lm_labels,
+            sequence_labels,
+            is_impossible_labels,
+        ):
             config.num_labels = input_ids_1.shape[1]
             model = TFXLNetForTokenClassification(config)
-            inputs = {'input_ids': input_ids_1,
-                      'attention_mask': input_mask,
-                      # 'token_type_ids': token_type_ids
-                      }
+            inputs = {
+                "input_ids": input_ids_1,
+                "attention_mask": input_mask,
+                # 'token_type_ids': token_type_ids
+            }
             logits, mems_1 = model(inputs)
             result = {
                 "mems_1": [mem.numpy() for mem in mems_1],
                 "logits": logits.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["logits"].shape),
-                [self.batch_size, self.seq_length, config.num_labels])
+                list(result["logits"].shape), [self.batch_size, self.seq_length, config.num_labels]
+            )
             self.parent.assertListEqual(
                 list(list(mem.shape) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, segment_ids, lm_labels,
-                sequence_labels, is_impossible_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids_1}
+            (
+                config,
+                input_ids_1,
+                input_ids_2,
+                input_ids_q,
+                perm_mask,
+                input_mask,
+                target_mapping,
+                segment_ids,
+                lm_labels,
+                sequence_labels,
+                is_impossible_labels,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids_1}
             return config, inputs_dict
 
-
     def setUp(self):
         self.model_tester = TFXLNetModelTest.TFXLNetModelTester(self)
         self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
diff --git a/transformers/tests/modeling_transfo_xl_test.py b/transformers/tests/modeling_transfo_xl_test.py
index acbe95fe4a..f04205d4e0 100644
--- a/transformers/tests/modeling_transfo_xl_test.py
+++ b/transformers/tests/modeling_transfo_xl_test.py
@@ -23,10 +23,10 @@ from transformers import is_torch_available
 
 if is_torch_available():
     import torch
-    from transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
+    from transformers import TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel
     from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .modeling_common_test import CommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
@@ -40,27 +40,27 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
     test_resize_embeddings = False
 
     class TransfoXLModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     mem_len=30,
-                     clamp_len=15,
-                     is_training=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     cutoffs=[10, 50, 80],
-                     hidden_size=32,
-                     d_embed=32,
-                     num_attention_heads=4,
-                     d_head=8,
-                     d_inner=128,
-                     div_val=2,
-                     num_hidden_layers=5,
-                     scope=None,
-                     seed=1,
-                     ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            mem_len=30,
+            clamp_len=15,
+            is_training=True,
+            use_labels=True,
+            vocab_size=99,
+            cutoffs=[10, 50, 80],
+            hidden_size=32,
+            d_embed=32,
+            num_attention_heads=4,
+            d_head=8,
+            d_inner=128,
+            div_val=2,
+            num_hidden_layers=5,
+            scope=None,
+            seed=1,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -100,7 +100,8 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
                 d_head=self.d_head,
                 d_inner=self.d_inner,
                 div_val=self.div_val,
-                n_layer=self.num_hidden_layers)
+                n_layer=self.num_hidden_layers,
+            )
 
             return (config, input_ids_1, input_ids_2, lm_labels)
 
@@ -125,18 +126,19 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
 
         def check_transfo_xl_model_output(self, result):
             self.parent.assertListEqual(
-                list(result["hidden_states_1"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["hidden_states_1"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(
-                list(result["hidden_states_2"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["hidden_states_2"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
-
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
         def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
             model = TransfoXLLMHeadModel(config)
@@ -159,33 +161,30 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
             return outputs
 
         def check_transfo_xl_lm_head_output(self, result):
+            self.parent.assertListEqual(list(result["loss_1"].size()), [self.batch_size, self.seq_length])
             self.parent.assertListEqual(
-                list(result["loss_1"].size()),
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["lm_logits_1"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["lm_logits_1"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
+            self.parent.assertListEqual(list(result["loss_2"].size()), [self.batch_size, self.seq_length])
             self.parent.assertListEqual(
-                list(result["loss_2"].size()),
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["lm_logits_2"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["lm_logits_2"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
             (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids_1}
+            inputs_dict = {"input_ids": input_ids_1}
             return config, inputs_dict
 
-
     def setUp(self):
         self.model_tester = TransfoXLModelTest.TransfoXLModelTester(self)
         self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
diff --git a/transformers/tests/modeling_xlm_test.py b/transformers/tests/modeling_xlm_test.py
index fcc2f4699b..843693fd03 100644
--- a/transformers/tests/modeling_xlm_test.py
+++ b/transformers/tests/modeling_xlm_test.py
@@ -21,11 +21,17 @@ import unittest
 from transformers import is_torch_available
 
 if is_torch_available():
-    from transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
-                                      XLMForSequenceClassification, XLMForQuestionAnsweringSimple)
+    from transformers import (
+        XLMConfig,
+        XLMModel,
+        XLMWithLMHeadModel,
+        XLMForQuestionAnswering,
+        XLMForSequenceClassification,
+        XLMForQuestionAnsweringSimple,
+    )
     from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .modeling_common_test import CommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
@@ -33,42 +39,50 @@ from .utils import CACHE_DIR, require_torch, slow, torch_device
 @require_torch
 class XLMModelTest(CommonTestCases.CommonModelTester):
 
-    all_model_classes = (XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
-                         XLMForSequenceClassification, XLMForQuestionAnsweringSimple) if is_torch_available() else ()
-
+    all_model_classes = (
+        (
+            XLMModel,
+            XLMWithLMHeadModel,
+            XLMForQuestionAnswering,
+            XLMForSequenceClassification,
+            XLMForQuestionAnsweringSimple,
+        )
+        if is_torch_available()
+        else ()
+    )
 
     class XLMModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_input_lengths=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     gelu_activation=True,
-                     sinusoidal_embeddings=False,
-                     causal=False,
-                     asm=False,
-                     n_langs=2,
-                     vocab_size=99,
-                     n_special=0,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     summary_type="last",
-                     use_proj=True,
-                     scope=None,
-                    ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            is_training=True,
+            use_input_lengths=True,
+            use_token_type_ids=True,
+            use_labels=True,
+            gelu_activation=True,
+            sinusoidal_embeddings=False,
+            causal=False,
+            asm=False,
+            n_langs=2,
+            vocab_size=99,
+            n_special=0,
+            hidden_size=32,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            summary_type="last",
+            use_proj=True,
+            scope=None,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -105,7 +119,9 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
 
             input_lengths = None
             if self.use_input_lengths:
-                input_lengths = ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2  # small variation of seq_length
+                input_lengths = (
+                    ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
+                )  # small variation of seq_length
 
             token_type_ids = None
             if self.use_token_type_ids:
@@ -120,31 +136,49 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
                 is_impossible_labels = ids_tensor([self.batch_size], 2).float()
 
             config = XLMConfig(
-                 vocab_size=self.vocab_size,
-                 n_special=self.n_special,
-                 emb_dim=self.hidden_size,
-                 n_layers=self.num_hidden_layers,
-                 n_heads=self.num_attention_heads,
-                 dropout=self.hidden_dropout_prob,
-                 attention_dropout=self.attention_probs_dropout_prob,
-                 gelu_activation=self.gelu_activation,
-                 sinusoidal_embeddings=self.sinusoidal_embeddings,
-                 asm=self.asm,
-                 causal=self.causal,
-                 n_langs=self.n_langs,
-                 max_position_embeddings=self.max_position_embeddings,
-                 initializer_range=self.initializer_range,
-                 summary_type=self.summary_type,
-                 use_proj=self.use_proj)
+                vocab_size=self.vocab_size,
+                n_special=self.n_special,
+                emb_dim=self.hidden_size,
+                n_layers=self.num_hidden_layers,
+                n_heads=self.num_attention_heads,
+                dropout=self.hidden_dropout_prob,
+                attention_dropout=self.attention_probs_dropout_prob,
+                gelu_activation=self.gelu_activation,
+                sinusoidal_embeddings=self.sinusoidal_embeddings,
+                asm=self.asm,
+                causal=self.causal,
+                n_langs=self.n_langs,
+                max_position_embeddings=self.max_position_embeddings,
+                initializer_range=self.initializer_range,
+                summary_type=self.summary_type,
+                use_proj=self.use_proj,
+            )
 
-            return config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask
+            return (
+                config,
+                input_ids,
+                token_type_ids,
+                input_lengths,
+                sequence_labels,
+                token_labels,
+                is_impossible_labels,
+                input_mask,
+            )
 
         def check_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
+            self.parent.assertListEqual(list(result["loss"].size()), [])
 
-        def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+        def create_and_check_xlm_model(
+            self,
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            input_mask,
+        ):
             model = XLMModel(config=config)
             model.to(torch_device)
             model.eval()
@@ -156,11 +190,20 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
                 "sequence_output": sequence_output,
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+            )
 
-
-        def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+        def create_and_check_xlm_lm_head(
+            self,
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            input_mask,
+        ):
             model = XLMWithLMHeadModel(config)
             model.to(torch_device)
             model.eval()
@@ -172,23 +215,29 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
                 "logits": logits,
             }
 
+            self.parent.assertListEqual(list(result["loss"].size()), [])
             self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
-            self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
 
-
-        def create_and_check_xlm_simple_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+        def create_and_check_xlm_simple_qa(
+            self,
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            input_mask,
+        ):
             model = XLMForQuestionAnsweringSimple(config)
             model.to(torch_device)
             model.eval()
 
             outputs = model(input_ids)
 
-            outputs = model(input_ids, start_positions=sequence_labels,
-                                       end_positions=sequence_labels)
+            outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
             loss, start_logits, end_logits = outputs
 
             result = {
@@ -196,16 +245,21 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
                 "start_logits": start_logits,
                 "end_logits": end_logits,
             }
-            self.parent.assertListEqual(
-                list(result["start_logits"].size()),
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["end_logits"].size()),
-                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
             self.check_loss_output(result)
 
-
-        def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+        def create_and_check_xlm_qa(
+            self,
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            input_mask,
+        ):
             model = XLMForQuestionAnswering(config)
             model.to(torch_device)
             model.eval()
@@ -213,21 +267,26 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
             outputs = model(input_ids)
             start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = outputs
 
-            outputs = model(input_ids, start_positions=sequence_labels,
-                                         end_positions=sequence_labels,
-                                         cls_index=sequence_labels,
-                                         is_impossible=is_impossible_labels,
-                                         p_mask=input_mask)
+            outputs = model(
+                input_ids,
+                start_positions=sequence_labels,
+                end_positions=sequence_labels,
+                cls_index=sequence_labels,
+                is_impossible=is_impossible_labels,
+                p_mask=input_mask,
+            )
 
-            outputs = model(input_ids, start_positions=sequence_labels,
-                                         end_positions=sequence_labels,
-                                         cls_index=sequence_labels,
-                                         is_impossible=is_impossible_labels)
+            outputs = model(
+                input_ids,
+                start_positions=sequence_labels,
+                end_positions=sequence_labels,
+                cls_index=sequence_labels,
+                is_impossible=is_impossible_labels,
+            )
 
             (total_loss,) = outputs
 
-            outputs = model(input_ids, start_positions=sequence_labels,
-                                         end_positions=sequence_labels)
+            outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
 
             (total_loss,) = outputs
 
@@ -240,27 +299,34 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
                 "cls_logits": cls_logits,
             }
 
+            self.parent.assertListEqual(list(result["loss"].size()), [])
             self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
+                list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top]
+            )
             self.parent.assertListEqual(
-                list(result["start_top_log_probs"].size()),
-                [self.batch_size, model.config.start_n_top])
-            self.parent.assertListEqual(
-                list(result["start_top_index"].size()),
-                [self.batch_size, model.config.start_n_top])
+                list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top]
+            )
             self.parent.assertListEqual(
                 list(result["end_top_log_probs"].size()),
-                [self.batch_size, model.config.start_n_top * model.config.end_n_top])
+                [self.batch_size, model.config.start_n_top * model.config.end_n_top],
+            )
             self.parent.assertListEqual(
                 list(result["end_top_index"].size()),
-                [self.batch_size, model.config.start_n_top * model.config.end_n_top])
-            self.parent.assertListEqual(
-                list(result["cls_logits"].size()),
-                [self.batch_size])
+                [self.batch_size, model.config.start_n_top * model.config.end_n_top],
+            )
+            self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size])
 
-
-        def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+        def create_and_check_xlm_sequence_classif(
+            self,
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            input_mask,
+        ):
             model = XLMForSequenceClassification(config)
             model.to(torch_device)
             model.eval()
@@ -273,19 +339,24 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
                 "logits": logits,
             }
 
+            self.parent.assertListEqual(list(result["loss"].size()), [])
             self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
-            self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.type_sequence_label_size])
-
+                list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size]
+            )
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, input_lengths,
-             sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths}
+            (
+                config,
+                input_ids,
+                token_type_ids,
+                input_lengths,
+                sequence_labels,
+                token_labels,
+                is_impossible_labels,
+                input_mask,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths}
             return config, inputs_dict
 
     def setUp(self):
diff --git a/transformers/tests/modeling_xlnet_test.py b/transformers/tests/modeling_xlnet_test.py
index 6d218d6ef4..487756a5c8 100644
--- a/transformers/tests/modeling_xlnet_test.py
+++ b/transformers/tests/modeling_xlnet_test.py
@@ -26,11 +26,17 @@ from transformers import is_torch_available
 if is_torch_available():
     import torch
 
-    from transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification,
-                              XLNetForTokenClassification, XLNetForQuestionAnswering)
+    from transformers import (
+        XLNetConfig,
+        XLNetModel,
+        XLNetLMHeadModel,
+        XLNetForSequenceClassification,
+        XLNetForTokenClassification,
+        XLNetForQuestionAnswering,
+    )
     from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .modeling_common_test import CommonTestCases, ids_tensor
 from .configuration_common_test import ConfigTester
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
@@ -38,35 +44,44 @@ from .utils import CACHE_DIR, require_torch, slow, torch_device
 @require_torch
 class XLNetModelTest(CommonTestCases.CommonModelTester):
 
-    all_model_classes=(XLNetModel, XLNetLMHeadModel, XLNetForTokenClassification,
-                    XLNetForSequenceClassification, XLNetForQuestionAnswering) if is_torch_available() else ()
+    all_model_classes = (
+        (
+            XLNetModel,
+            XLNetLMHeadModel,
+            XLNetForTokenClassification,
+            XLNetForSequenceClassification,
+            XLNetForQuestionAnswering,
+        )
+        if is_torch_available()
+        else ()
+    )
     test_pruning = False
 
     class XLNetModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     mem_len=10,
-                     clamp_len=-1,
-                     reuse_len=15,
-                     is_training=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     cutoffs=[10, 50, 80],
-                     hidden_size=32,
-                     num_attention_heads=4,
-                     d_inner=128,
-                     num_hidden_layers=5,
-                     type_sequence_label_size=2,
-                     untie_r=True,
-                     bi_data=False,
-                     same_length=False,
-                     initializer_range=0.05,
-                     seed=1,
-                     type_vocab_size=2,
-            ):
+        def __init__(
+            self,
+            parent,
+            batch_size=13,
+            seq_length=7,
+            mem_len=10,
+            clamp_len=-1,
+            reuse_len=15,
+            is_training=True,
+            use_labels=True,
+            vocab_size=99,
+            cutoffs=[10, 50, 80],
+            hidden_size=32,
+            num_attention_heads=4,
+            d_inner=128,
+            num_hidden_layers=5,
+            type_sequence_label_size=2,
+            untie_r=True,
+            bi_data=False,
+            same_length=False,
+            initializer_range=0.05,
+            seed=1,
+            type_vocab_size=2,
+        ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -97,9 +112,13 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
             input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()
 
             input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
-            perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device)
+            perm_mask = torch.zeros(
+                self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device
+            )
             perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-            target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device)
+            target_mapping = torch.zeros(
+                self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device
+            )
             target_mapping[:, 0, -1] = 1.0  # predict last token
 
             sequence_labels = None
@@ -125,17 +144,43 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                 reuse_len=self.reuse_len,
                 bi_data=self.bi_data,
                 initializer_range=self.initializer_range,
-                num_labels=self.type_sequence_label_size)
+                num_labels=self.type_sequence_label_size,
+            )
 
-            return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                    target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels)
+            return (
+                config,
+                input_ids_1,
+                input_ids_2,
+                input_ids_q,
+                perm_mask,
+                input_mask,
+                target_mapping,
+                segment_ids,
+                lm_labels,
+                sequence_labels,
+                is_impossible_labels,
+                token_labels,
+            )
 
         def set_seed(self):
             random.seed(self.seed)
             torch.manual_seed(self.seed)
 
-        def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
+        def create_and_check_xlnet_base_model(
+            self,
+            config,
+            input_ids_1,
+            input_ids_2,
+            input_ids_q,
+            perm_mask,
+            input_mask,
+            target_mapping,
+            segment_ids,
+            lm_labels,
+            sequence_labels,
+            is_impossible_labels,
+            token_labels,
+        ):
             model = XLNetModel(config)
             model.to(torch_device)
             model.eval()
@@ -158,14 +203,28 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
             self.parent.assertEqual(len(no_mems_outputs), 1)
 
             self.parent.assertListEqual(
-                list(result["outputs"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                list(result["outputs"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+            )
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
-        def create_and_check_xlnet_base_model_with_att_output(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                    target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
+        def create_and_check_xlnet_base_model_with_att_output(
+            self,
+            config,
+            input_ids_1,
+            input_ids_2,
+            input_ids_q,
+            perm_mask,
+            input_mask,
+            target_mapping,
+            segment_ids,
+            lm_labels,
+            sequence_labels,
+            is_impossible_labels,
+            token_labels,
+        ):
             model = XLNetModel(config)
             model.to(torch_device)
             model.eval()
@@ -177,15 +236,30 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
             self.parent.assertEqual(len(attentions[0]), 2)
             self.parent.assertTrue(attentions[0][0].shape, attentions[0][0].shape)
 
-        def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
+        def create_and_check_xlnet_lm_head(
+            self,
+            config,
+            input_ids_1,
+            input_ids_2,
+            input_ids_q,
+            perm_mask,
+            input_mask,
+            target_mapping,
+            segment_ids,
+            lm_labels,
+            sequence_labels,
+            is_impossible_labels,
+            token_labels,
+        ):
             model = XLNetLMHeadModel(config)
             model.to(torch_device)
             model.eval()
 
             loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels)
 
-            loss_2, all_logits_2, mems_2 = model(input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1)
+            loss_2, all_logits_2, mems_2 = model(
+                input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1
+            )
 
             logits, _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping)
 
@@ -198,28 +272,39 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                 "all_logits_2": all_logits_2,
             }
 
+            self.parent.assertListEqual(list(result["loss_1"].size()), [])
             self.parent.assertListEqual(
-                list(result["loss_1"].size()),
-                [])
-            self.parent.assertListEqual(
-                list(result["all_logits_1"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["all_logits_1"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
+            self.parent.assertListEqual(list(result["loss_2"].size()), [])
             self.parent.assertListEqual(
-                list(result["loss_2"].size()),
-                [])
-            self.parent.assertListEqual(
-                list(result["all_logits_2"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                list(result["all_logits_2"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
-        def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
+        def create_and_check_xlnet_qa(
+            self,
+            config,
+            input_ids_1,
+            input_ids_2,
+            input_ids_q,
+            perm_mask,
+            input_mask,
+            target_mapping,
+            segment_ids,
+            lm_labels,
+            sequence_labels,
+            is_impossible_labels,
+            token_labels,
+        ):
             model = XLNetForQuestionAnswering(config)
             model.to(torch_device)
             model.eval()
@@ -227,21 +312,26 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
             outputs = model(input_ids_1)
             start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems = outputs
 
-            outputs = model(input_ids_1, start_positions=sequence_labels,
-                                         end_positions=sequence_labels,
-                                         cls_index=sequence_labels,
-                                         is_impossible=is_impossible_labels,
-                                         p_mask=input_mask)
+            outputs = model(
+                input_ids_1,
+                start_positions=sequence_labels,
+                end_positions=sequence_labels,
+                cls_index=sequence_labels,
+                is_impossible=is_impossible_labels,
+                p_mask=input_mask,
+            )
 
-            outputs = model(input_ids_1, start_positions=sequence_labels,
-                                         end_positions=sequence_labels,
-                                         cls_index=sequence_labels,
-                                         is_impossible=is_impossible_labels)
+            outputs = model(
+                input_ids_1,
+                start_positions=sequence_labels,
+                end_positions=sequence_labels,
+                cls_index=sequence_labels,
+                is_impossible=is_impossible_labels,
+            )
 
             total_loss, mems = outputs
 
-            outputs = model(input_ids_1, start_positions=sequence_labels,
-                                         end_positions=sequence_labels)
+            outputs = model(input_ids_1, start_positions=sequence_labels, end_positions=sequence_labels)
 
             total_loss, mems = outputs
 
@@ -255,30 +345,42 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                 "mems": mems,
             }
 
+            self.parent.assertListEqual(list(result["loss"].size()), [])
             self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
+                list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top]
+            )
             self.parent.assertListEqual(
-                list(result["start_top_log_probs"].size()),
-                [self.batch_size, model.config.start_n_top])
-            self.parent.assertListEqual(
-                list(result["start_top_index"].size()),
-                [self.batch_size, model.config.start_n_top])
+                list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top]
+            )
             self.parent.assertListEqual(
                 list(result["end_top_log_probs"].size()),
-                [self.batch_size, model.config.start_n_top * model.config.end_n_top])
+                [self.batch_size, model.config.start_n_top * model.config.end_n_top],
+            )
             self.parent.assertListEqual(
                 list(result["end_top_index"].size()),
-                [self.batch_size, model.config.start_n_top * model.config.end_n_top])
-            self.parent.assertListEqual(
-                list(result["cls_logits"].size()),
-                [self.batch_size])
+                [self.batch_size, model.config.start_n_top * model.config.end_n_top],
+            )
+            self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
-        def create_and_check_xlnet_token_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
+        def create_and_check_xlnet_token_classif(
+            self,
+            config,
+            input_ids_1,
+            input_ids_2,
+            input_ids_q,
+            perm_mask,
+            input_mask,
+            target_mapping,
+            segment_ids,
+            lm_labels,
+            sequence_labels,
+            is_impossible_labels,
+            token_labels,
+        ):
             model = XLNetForTokenClassification(config)
             model.to(torch_device)
             model.eval()
@@ -292,26 +394,48 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                 "logits": logits,
             }
 
+            self.parent.assertListEqual(list(result["loss"].size()), [])
             self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
-            self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.seq_length, self.type_sequence_label_size])
+                list(result["logits"].size()), [self.batch_size, self.seq_length, self.type_sequence_label_size]
+            )
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, segment_ids, lm_labels,
-                sequence_labels, is_impossible_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids_1}
+            (
+                config,
+                input_ids_1,
+                input_ids_2,
+                input_ids_q,
+                perm_mask,
+                input_mask,
+                target_mapping,
+                segment_ids,
+                lm_labels,
+                sequence_labels,
+                is_impossible_labels,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids_1}
             return config, inputs_dict
 
-        def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
+        def create_and_check_xlnet_sequence_classif(
+            self,
+            config,
+            input_ids_1,
+            input_ids_2,
+            input_ids_q,
+            perm_mask,
+            input_mask,
+            target_mapping,
+            segment_ids,
+            lm_labels,
+            sequence_labels,
+            is_impossible_labels,
+            token_labels,
+        ):
             model = XLNetForSequenceClassification(config)
             model.to(torch_device)
             model.eval()
@@ -325,25 +449,34 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                 "logits": logits,
             }
 
+            self.parent.assertListEqual(list(result["loss"].size()), [])
             self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
-            self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.type_sequence_label_size])
+                list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size]
+            )
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+            )
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, segment_ids, lm_labels,
-                sequence_labels, is_impossible_labels, token_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids_1}
+            (
+                config,
+                input_ids_1,
+                input_ids_2,
+                input_ids_q,
+                perm_mask,
+                input_mask,
+                target_mapping,
+                segment_ids,
+                lm_labels,
+                sequence_labels,
+                is_impossible_labels,
+                token_labels,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids_1}
             return config, inputs_dict
 
-
     def setUp(self):
         self.model_tester = XLNetModelTest.XLNetModelTester(self)
         self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
diff --git a/transformers/tests/optimization_test.py b/transformers/tests/optimization_test.py
index cc10ad5908..0addcde1d8 100644
--- a/transformers/tests/optimization_test.py
+++ b/transformers/tests/optimization_test.py
@@ -24,12 +24,14 @@ from transformers import is_torch_available
 if is_torch_available():
     import torch
 
-    from transformers import (AdamW,
-                              get_constant_schedule,
-                              get_constant_schedule_with_warmup,
-                              get_cosine_schedule_with_warmup,
-                              get_cosine_with_hard_restarts_schedule_with_warmup,
-                              get_linear_schedule_with_warmup)
+    from transformers import (
+        AdamW,
+        get_constant_schedule,
+        get_constant_schedule_with_warmup,
+        get_cosine_schedule_with_warmup,
+        get_cosine_with_hard_restarts_schedule_with_warmup,
+        get_linear_schedule_with_warmup,
+    )
 
 from .tokenization_tests_commons import TemporaryDirectory
 from .utils import require_torch
@@ -42,6 +44,7 @@ def unwrap_schedule(scheduler, num_steps=10):
         lrs.append(scheduler.get_lr())
     return lrs
 
+
 def unwrap_and_save_reload_schedule(scheduler, num_steps=10):
     lrs = []
     for step in range(num_steps):
@@ -49,16 +52,16 @@ def unwrap_and_save_reload_schedule(scheduler, num_steps=10):
         lrs.append(scheduler.get_lr())
         if step == num_steps // 2:
             with TemporaryDirectory() as tmpdirname:
-                file_name = os.path.join(tmpdirname, 'schedule.bin')
+                file_name = os.path.join(tmpdirname, "schedule.bin")
                 torch.save(scheduler.state_dict(), file_name)
 
                 state_dict = torch.load(file_name)
                 scheduler.load_state_dict(state_dict)
     return lrs
 
+
 @require_torch
 class OptimizationTest(unittest.TestCase):
-
     def assertListAlmostEqual(self, list1, list2, tol):
         self.assertEqual(len(list1), len(list2))
         for a, b in zip(list1, list2):
@@ -74,7 +77,7 @@ class OptimizationTest(unittest.TestCase):
             loss = criterion(w, target)
             loss.backward()
             optimizer.step()
-            w.grad.detach_() # No zero_grad() function on simple tensors. we do it ourselves.
+            w.grad.detach_()  # No zero_grad() function on simple tensors. we do it ourselves.
             w.grad.zero_()
         self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
 
@@ -82,7 +85,7 @@ class OptimizationTest(unittest.TestCase):
 @require_torch
 class ScheduleInitTest(unittest.TestCase):
     m = torch.nn.Linear(50, 50) if is_torch_available() else None
-    optimizer = AdamW(m.parameters(), lr=10.) if is_torch_available() else None
+    optimizer = AdamW(m.parameters(), lr=10.0) if is_torch_available() else None
     num_steps = 10
 
     def assertListAlmostEqual(self, list1, list2, tol):
@@ -93,7 +96,7 @@ class ScheduleInitTest(unittest.TestCase):
     def test_constant_scheduler(self):
         scheduler = get_constant_schedule(self.optimizer)
         lrs = unwrap_schedule(scheduler, self.num_steps)
-        expected_learning_rates = [10.] * self.num_steps
+        expected_learning_rates = [10.0] * self.num_steps
         self.assertEqual(len(lrs[0]), 1)
         self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
 
@@ -135,13 +138,17 @@ class ScheduleInitTest(unittest.TestCase):
         self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
 
     def test_warmup_cosine_hard_restart_scheduler(self):
-        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10)
+        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
+            self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10
+        )
         lrs = unwrap_schedule(scheduler, self.num_steps)
         expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0]
         self.assertEqual(len(lrs[0]), 1)
         self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)
 
-        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10)
+        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
+            self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10
+        )
         lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
         self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
 
diff --git a/transformers/tests/optimization_tf_test.py b/transformers/tests/optimization_tf_test.py
index 515d12a158..e88ee971e4 100644
--- a/transformers/tests/optimization_tf_test.py
+++ b/transformers/tests/optimization_tf_test.py
@@ -12,7 +12,7 @@ if is_tf_available():
     import tensorflow as tf
     from tensorflow.python.eager import context
     from tensorflow.python.framework import ops
-    from transformers import (create_optimizer, GradientAccumulator)
+    from transformers import create_optimizer, GradientAccumulator
 
 
 @require_tf
@@ -21,7 +21,7 @@ class OptimizationFTest(unittest.TestCase):
         self.assertEqual(len(list1), len(list2))
         for a, b in zip(list1, list2):
             self.assertAlmostEqual(a, b, delta=tol)
-    
+
     def testGradientAccumulator(self):
         accumulator = GradientAccumulator()
         accumulator([tf.constant([1.0, 2.0])])
@@ -42,8 +42,8 @@ class OptimizationFTest(unittest.TestCase):
         physical_devices = tf.config.experimental.list_physical_devices("CPU")
         tf.config.experimental.set_virtual_device_configuration(
             physical_devices[0],
-            [tf.config.experimental.VirtualDeviceConfiguration(),
-            tf.config.experimental.VirtualDeviceConfiguration()])
+            [tf.config.experimental.VirtualDeviceConfiguration(), tf.config.experimental.VirtualDeviceConfiguration()],
+        )
 
         devices = tf.config.experimental.list_logical_devices(device_type="CPU")
         strategy = tf.distribute.MirroredStrategy(devices=[device.name for device in devices])
@@ -87,4 +87,4 @@ class OptimizationFTest(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/transformers/tests/pipelines_test.py b/transformers/tests/pipelines_test.py
index 08a1507770..3c258594d1 100644
--- a/transformers/tests/pipelines_test.py
+++ b/transformers/tests/pipelines_test.py
@@ -6,58 +6,58 @@ from transformers import pipeline
 from transformers.tests.utils import require_tf, require_torch
 
 QA_FINETUNED_MODELS = {
-    ('bert-base-uncased', 'bert-large-uncased-whole-word-masking-finetuned-squad', None),
-    ('bert-base-cased', 'bert-large-cased-whole-word-masking-finetuned-squad', None),
-    ('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None)
+    ("bert-base-uncased", "bert-large-uncased-whole-word-masking-finetuned-squad", None),
+    ("bert-base-cased", "bert-large-cased-whole-word-masking-finetuned-squad", None),
+    ("bert-base-uncased", "distilbert-base-uncased-distilled-squad", None),
 }
 
 TF_QA_FINETUNED_MODELS = {
-    ('bert-base-uncased', 'bert-large-uncased-whole-word-masking-finetuned-squad', None),
-    ('bert-base-cased', 'bert-large-cased-whole-word-masking-finetuned-squad', None),
-    ('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None)
+    ("bert-base-uncased", "bert-large-uncased-whole-word-masking-finetuned-squad", None),
+    ("bert-base-cased", "bert-large-cased-whole-word-masking-finetuned-squad", None),
+    ("bert-base-uncased", "distilbert-base-uncased-distilled-squad", None),
 }
 
 TF_NER_FINETUNED_MODELS = {
     (
-        'bert-base-cased',
-        'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5',
-        'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json'
+        "bert-base-cased",
+        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5",
+        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json",
     )
 }
 
 NER_FINETUNED_MODELS = {
     (
-        'bert-base-cased',
-        'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin',
-        'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json'
+        "bert-base-cased",
+        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin",
+        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json",
     )
 }
 
 FEATURE_EXTRACT_FINETUNED_MODELS = {
-   ('bert-base-cased', 'bert-base-cased', None),
-   # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2
-   ('distilbert-base-uncased', 'distilbert-base-uncased', None)
+    ("bert-base-cased", "bert-base-cased", None),
+    # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2
+    ("distilbert-base-uncased", "distilbert-base-uncased", None),
 }
 
 TF_FEATURE_EXTRACT_FINETUNED_MODELS = {
-   ('bert-base-cased', 'bert-base-cased', None),
-   # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2
-   ('distilbert-base-uncased', 'distilbert-base-uncased', None)
+    ("bert-base-cased", "bert-base-cased", None),
+    # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2
+    ("distilbert-base-uncased", "distilbert-base-uncased", None),
 }
 
 TF_TEXT_CLASSIF_FINETUNED_MODELS = {
     (
-        'bert-base-uncased',
-        'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5',
-        'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json'
+        "bert-base-uncased",
+        "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5",
+        "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json",
     )
 }
 
 TEXT_CLASSIF_FINETUNED_MODELS = {
     (
-        'bert-base-uncased',
-        'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin',
-        'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json'
+        "bert-base-uncased",
+        "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin",
+        "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json",
     )
 }
 
@@ -91,54 +91,54 @@ class MonoColumnInputTestCase(unittest.TestCase):
 
     @require_torch
     def test_ner(self):
-        mandatory_keys = {'entity', 'word', 'score'}
-        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        mandatory_keys = {"entity", "word", "score"}
+        valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
         invalid_inputs = [None]
         for tokenizer, model, config in NER_FINETUNED_MODELS:
-            nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer)
+            nlp = pipeline(task="ner", model=model, config=config, tokenizer=tokenizer)
             self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
 
     @require_tf
     def test_tf_ner(self):
-        mandatory_keys = {'entity', 'word', 'score'}
-        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        mandatory_keys = {"entity", "word", "score"}
+        valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
         invalid_inputs = [None]
         for tokenizer, model, config in TF_NER_FINETUNED_MODELS:
-            nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer)
+            nlp = pipeline(task="ner", model=model, config=config, tokenizer=tokenizer)
             self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
 
     @require_torch
     def test_sentiment_analysis(self):
-        mandatory_keys = {'label'}
-        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        mandatory_keys = {"label"}
+        valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
         invalid_inputs = [None]
         for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS:
-            nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer)
+            nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer)
             self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
 
     @require_tf
     def test_tf_sentiment_analysis(self):
-        mandatory_keys = {'label'}
-        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        mandatory_keys = {"label"}
+        valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
         invalid_inputs = [None]
         for tokenizer, model, config in TF_TEXT_CLASSIF_FINETUNED_MODELS:
-            nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer)
+            nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer)
             self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
 
     @require_torch
     def test_features_extraction(self):
-        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
         invalid_inputs = [None]
         for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS:
-            nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer)
+            nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer)
             self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})
 
     @require_tf
     def test_tf_features_extraction(self):
-        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
         invalid_inputs = [None]
         for tokenizer, model, config in TF_FEATURE_EXTRACT_FINETUNED_MODELS:
-            nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer)
+            nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer)
             self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})
 
 
@@ -165,46 +165,46 @@ class MultiColumnInputTestCase(unittest.TestCase):
 
     @require_torch
     def test_question_answering(self):
-        mandatory_output_keys = {'score', 'answer', 'start', 'end'}
+        mandatory_output_keys = {"score", "answer", "start", "end"}
         valid_samples = [
-            {'question': 'Where was HuggingFace founded ?', 'context': 'HuggingFace was founded in Paris.'},
+            {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."},
             {
-                'question': 'In what field is HuggingFace working ?',
-                'context': 'HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.'
-            }
+                "question": "In what field is HuggingFace working ?",
+                "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.",
+            },
         ]
         invalid_samples = [
-            {'question': '', 'context': 'This is a test to try empty question edge case'},
-            {'question': None, 'context': 'This is a test to try empty question edge case'},
-            {'question': 'What is does with empty context ?', 'context': ''},
-            {'question': 'What is does with empty context ?', 'context': None},
+            {"question": "", "context": "This is a test to try empty question edge case"},
+            {"question": None, "context": "This is a test to try empty question edge case"},
+            {"question": "What is does with empty context ?", "context": ""},
+            {"question": "What is does with empty context ?", "context": None},
         ]
 
         for tokenizer, model, config in QA_FINETUNED_MODELS:
-            nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer)
+            nlp = pipeline(task="question-answering", model=model, config=config, tokenizer=tokenizer)
             self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys)
 
     @require_tf
     def test_tf_question_answering(self):
-        mandatory_output_keys = {'score', 'answer', 'start', 'end'}
+        mandatory_output_keys = {"score", "answer", "start", "end"}
         valid_samples = [
-            {'question': 'Where was HuggingFace founded ?', 'context': 'HuggingFace was founded in Paris.'},
+            {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."},
             {
-                'question': 'In what field is HuggingFace working ?',
-                'context': 'HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.'
-            }
+                "question": "In what field is HuggingFace working ?",
+                "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.",
+            },
         ]
         invalid_samples = [
-            {'question': '', 'context': 'This is a test to try empty question edge case'},
-            {'question': None, 'context': 'This is a test to try empty question edge case'},
-            {'question': 'What is does with empty context ?', 'context': ''},
-            {'question': 'What is does with empty context ?', 'context': None},
+            {"question": "", "context": "This is a test to try empty question edge case"},
+            {"question": None, "context": "This is a test to try empty question edge case"},
+            {"question": "What is does with empty context ?", "context": ""},
+            {"question": "What is does with empty context ?", "context": None},
         ]
 
         for tokenizer, model, config in TF_QA_FINETUNED_MODELS:
-            nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer)
+            nlp = pipeline(task="question-answering", model=model, config=config, tokenizer=tokenizer)
             self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_albert_test.py b/transformers/tests/tokenization_albert_test.py
index 59eb3bceb0..7d7e793b54 100644
--- a/transformers/tests/tokenization_albert_test.py
+++ b/transformers/tests/tokenization_albert_test.py
@@ -17,12 +17,12 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 
-from transformers.tokenization_albert import (AlbertTokenizer, SPIECE_UNDERLINE)
+from transformers.tokenization_albert import AlbertTokenizer, SPIECE_UNDERLINE
 
 from .tokenization_tests_commons import CommonTestCases
 
-SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
-                    'fixtures/spiece.model')
+SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model")
+
 
 class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
@@ -39,27 +39,30 @@ class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         return AlbertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self):
-        input_text = u"this is a test"
-        output_text = u"this is a test"
+        input_text = "this is a test"
+        output_text = "this is a test"
         return input_text, output_text
 
-
     def test_full_tokenizer(self):
         tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)
 
-        tokens = tokenizer.tokenize(u'This is a test')
-        self.assertListEqual(tokens, [u'▁this', u'▁is', u'▁a', u'▁test'])
+        tokens = tokenizer.tokenize("This is a test")
+        self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"])
 
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])
+
+        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
         self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])
-
-        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
-        self.assertListEqual(tokens, [u'▁i', u'▁was', u'▁born', u'▁in', u'▁9', u'2000', u',', u'▁and', u'▁this', u'▁is', u'▁fal', u's', u'é', u'.'])
+            tokens, ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."]
+        )
         ids = tokenizer.convert_tokens_to_ids(tokens)
         self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
 
         back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(back_tokens, ['▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fal', 's', '<unk>', '.'])
+        self.assertListEqual(
+            back_tokens,
+            ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "."],
+        )
 
     def test_sequence_builders(self):
         tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
@@ -71,8 +74,10 @@ class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
 
         assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
-        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [tokenizer.sep_token_id]
+        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [
+            tokenizer.sep_token_id
+        ]
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_auto_test.py b/transformers/tests/tokenization_auto_test.py
index 0a894cac04..7d77bf5b23 100644
--- a/transformers/tests/tokenization_auto_test.py
+++ b/transformers/tests/tokenization_auto_test.py
@@ -48,5 +48,6 @@ class AutoTokenizerTest(unittest.TestCase):
         self.assertIsInstance(tokenizer, BertTokenizer)
         self.assertEqual(len(tokenizer), 12)
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_bert_japanese_test.py b/transformers/tests/tokenization_bert_japanese_test.py
index 545193c7cc..02eb8c0a66 100644
--- a/transformers/tests/tokenization_bert_japanese_test.py
+++ b/transformers/tests/tokenization_bert_japanese_test.py
@@ -19,9 +19,12 @@ import unittest
 from io import open
 
 from transformers.tokenization_bert import WordpieceTokenizer
-from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer,
-                                                     MecabTokenizer, CharacterTokenizer,
-                                                     VOCAB_FILES_NAMES)
+from transformers.tokenization_bert_japanese import (
+    BertJapaneseTokenizer,
+    MecabTokenizer,
+    CharacterTokenizer,
+    VOCAB_FILES_NAMES,
+)
 
 from .tokenization_tests_commons import CommonTestCases
 from .utils import slow, custom_tokenizers
@@ -35,9 +38,24 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
     def setUp(self):
         super(BertJapaneseTokenizationTest, self).setUp()
 
-        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
-            u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは",
-            u"世界", u"##世界", u"、", u"##、", u"。", u"##。"]
+        vocab_tokens = [
+            "[UNK]",
+            "[CLS]",
+            "[SEP]",
+            "こんにちは",
+            "こん",
+            "にちは",
+            "ばんは",
+            "##こん",
+            "##にちは",
+            "##ばんは",
+            "世界",
+            "##世界",
+            "、",
+            "##、",
+            "。",
+            "##。",
+        ]
 
         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
         with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
@@ -47,70 +65,63 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
         return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self):
-        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
-        output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。"
+        input_text = "こんにちは、世界。 \nこんばんは、世界。"
+        output_text = "こんにちは 、 世界 。 こんばんは 、 世界 。"
         return input_text, output_text
 
     def test_full_tokenizer(self):
         tokenizer = self.tokenizer_class(self.vocab_file)
 
-        tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。")
-        self.assertListEqual(tokens,
-                             [u"こんにちは", u"、", u"世界", u"。",
-                              u"こん", u"##ばんは", u"、", u"世界", "。"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
-                             [3, 12, 10, 14, 4, 9, 12, 10, 14])
+        tokens = tokenizer.tokenize("こんにちは、世界。\nこんばんは、世界。")
+        self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
 
     def test_mecab_tokenizer(self):
         tokenizer = MecabTokenizer()
 
         self.assertListEqual(
-            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
-                               [u"アップルストア", u"で", u"iPhone", u"8", u"が",
-                                u"発売", u"さ", u"れ", u"た", u"。"])
+            tokenizer.tokenize(" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+            ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
+        )
 
     def test_mecab_tokenizer_lower(self):
         tokenizer = MecabTokenizer(do_lower_case=True)
 
         self.assertListEqual(
-            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
-                               [u"アップルストア", u"で", u"iphone", u"8", u"が",
-                                u"発売", u"さ", u"れ", u"た", u"。"])
+            tokenizer.tokenize(" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+            ["アップルストア", "で", "iphone", "8", "が", "発売", "さ", "れ", "た", "。"],
+        )
 
     def test_mecab_tokenizer_no_normalize(self):
         tokenizer = MecabTokenizer(normalize_text=False)
 
         self.assertListEqual(
-            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
-                               [u"ｱｯﾌﾟﾙストア", u"で", u"iPhone", u"８", u"が",
-                                u"発売", u"さ", u"れ", u"た", u"　", u"。"])
+            tokenizer.tokenize(" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+            ["ｱｯﾌﾟﾙストア", "で", "iPhone", "８", "が", "発売", "さ", "れ", "た", "　", "。"],
+        )
 
     def test_wordpiece_tokenizer(self):
-        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
-            u"こんにちは", u"こん", u"にちは" u"ばんは", u"##こん", u"##にちは", u"##ばんは"]
+        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは" "ばんは", "##こん", "##にちは", "##ばんは"]
 
         vocab = {}
         for (i, token) in enumerate(vocab_tokens):
             vocab[token] = i
-        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]")
+        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
 
-        self.assertListEqual(tokenizer.tokenize(u""), [])
+        self.assertListEqual(tokenizer.tokenize(""), [])
 
-        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
-                             [u"こんにちは"])
+        self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こんにちは"])
 
-        self.assertListEqual(tokenizer.tokenize(u"こんばんは"),
-                             [u"こん", u"##ばんは"])
+        self.assertListEqual(tokenizer.tokenize("こんばんは"), ["こん", "##ばんは"])
 
-        self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"),
-                             [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"])
+        self.assertListEqual(tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"])
 
     @slow
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")
 
-        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
-        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)
+        text = tokenizer.encode("ありがとう。", add_special_tokens=False)
+        text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False)
 
         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
@@ -127,58 +138,51 @@ class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTeste
     def setUp(self):
         super(BertJapaneseCharacterTokenizationTest, self).setUp()
 
-        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
-            u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"]
+        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"]
 
         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
         with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
     def get_tokenizer(self, **kwargs):
-        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname,
-                                                     subword_tokenizer_type="character",
-                                                     **kwargs)
+        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, subword_tokenizer_type="character", **kwargs)
 
     def get_input_output_texts(self):
-        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
-        output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
+        input_text = "こんにちは、世界。 \nこんばんは、世界。"
+        output_text = "こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
         return input_text, output_text
 
     def test_full_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file,
-                                         subword_tokenizer_type="character")
+        tokenizer = self.tokenizer_class(self.vocab_file, subword_tokenizer_type="character")
 
-        tokens = tokenizer.tokenize(u"こんにちは、世界。 \nこんばんは、世界。")
-        self.assertListEqual(tokens,
-            [u"こ", u"ん", u"に", u"ち", u"は", u"、", u"世", u"界", u"。",
-             u"こ", u"ん", u"ば", u"ん", u"は", u"、", u"世", u"界", u"。"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
-                             [3, 4, 5, 6, 7, 11, 9, 10, 12,
-                              3, 4, 8, 4, 7, 11, 9, 10, 12])
+        tokens = tokenizer.tokenize("こんにちは、世界。 \nこんばんは、世界。")
+        self.assertListEqual(
+            tokens, ["こ", "ん", "に", "ち", "は", "、", "世", "界", "。", "こ", "ん", "ば", "ん", "は", "、", "世", "界", "。"]
+        )
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(tokens), [3, 4, 5, 6, 7, 11, 9, 10, 12, 3, 4, 8, 4, 7, 11, 9, 10, 12]
+        )
 
     def test_character_tokenizer(self):
-        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
-            u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界"u"、", u"。"]
+        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界" "、", "。"]
 
         vocab = {}
         for (i, token) in enumerate(vocab_tokens):
             vocab[token] = i
-        tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]")
+        tokenizer = CharacterTokenizer(vocab=vocab, unk_token="[UNK]")
 
-        self.assertListEqual(tokenizer.tokenize(u""), [])
+        self.assertListEqual(tokenizer.tokenize(""), [])
 
-        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
-                             [u"こ", u"ん", u"に", u"ち", u"は"])
+        self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こ", "ん", "に", "ち", "は"])
 
-        self.assertListEqual(tokenizer.tokenize(u"こんにちほ"),
-                             [u"こ", u"ん", u"に", u"ち", u"[UNK]"])
+        self.assertListEqual(tokenizer.tokenize("こんにちほ"), ["こ", "ん", "に", "ち", "[UNK]"])
 
     @slow
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")
 
-        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
-        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)
+        text = tokenizer.encode("ありがとう。", add_special_tokens=False)
+        text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False)
 
         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
@@ -186,6 +190,3 @@ class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTeste
         # 2 is for "[CLS]", 3 is for "[SEP]"
         assert encoded_sentence == [2] + text + [3]
         assert encoded_pair == [2] + text + [3] + text_2 + [3]
-
-
-
diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py
index c503ea5e1e..bf023761a6 100644
--- a/transformers/tests/tokenization_bert_test.py
+++ b/transformers/tests/tokenization_bert_test.py
@@ -18,15 +18,20 @@ import os
 import unittest
 from io import open
 
-from transformers.tokenization_bert import (BasicTokenizer,
-                                                    BertTokenizer,
-                                                    WordpieceTokenizer,
-                                                    _is_control, _is_punctuation,
-                                                    _is_whitespace, VOCAB_FILES_NAMES)
+from transformers.tokenization_bert import (
+    BasicTokenizer,
+    BertTokenizer,
+    WordpieceTokenizer,
+    _is_control,
+    _is_punctuation,
+    _is_whitespace,
+    VOCAB_FILES_NAMES,
+)
 
 from .tokenization_tests_commons import CommonTestCases
 from .utils import slow
 
+
 class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
     tokenizer_class = BertTokenizer
@@ -35,55 +40,61 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         super(BertTokenizationTest, self).setUp()
 
         vocab_tokens = [
-            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
-            "##ing", ",", "low", "lowest",
+            "[UNK]",
+            "[CLS]",
+            "[SEP]",
+            "want",
+            "##want",
+            "##ed",
+            "wa",
+            "un",
+            "runn",
+            "##ing",
+            ",",
+            "low",
+            "lowest",
         ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-        with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
     def get_tokenizer(self, **kwargs):
         return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self):
-        input_text = u"UNwant\u00E9d,running"
-        output_text = u"unwanted, running"
+        input_text = "UNwant\u00E9d,running"
+        output_text = "unwanted, running"
         return input_text, output_text
 
     def test_full_tokenizer(self):
         tokenizer = self.tokenizer_class(self.vocab_file)
 
-        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
+        tokens = tokenizer.tokenize("UNwant\u00E9d,running")
         self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
         self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
     def test_chinese(self):
         tokenizer = BasicTokenizer()
 
-        self.assertListEqual(
-            tokenizer.tokenize(u"ah\u535A\u63A8zz"),
-            [u"ah", u"\u535A", u"\u63A8", u"zz"])
+        self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"])
 
     def test_basic_tokenizer_lower(self):
         tokenizer = BasicTokenizer(do_lower_case=True)
 
         self.assertListEqual(
-            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
-            ["hello", "!", "how", "are", "you", "?"])
-        self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
+            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["hello", "!", "how", "are", "you", "?"]
+        )
+        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
 
     def test_basic_tokenizer_no_lower(self):
         tokenizer = BasicTokenizer(do_lower_case=False)
 
         self.assertListEqual(
-            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
-            ["HeLLo", "!", "how", "Are", "yoU", "?"])
+            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
+        )
 
     def test_wordpiece_tokenizer(self):
-        vocab_tokens = [
-            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
-            "##ing"
-        ]
+        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
 
         vocab = {}
         for (i, token) in enumerate(vocab_tokens):
@@ -92,39 +103,36 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
         self.assertListEqual(tokenizer.tokenize(""), [])
 
-        self.assertListEqual(
-            tokenizer.tokenize("unwanted running"),
-            ["un", "##want", "##ed", "runn", "##ing"])
+        self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])
 
-        self.assertListEqual(
-            tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
+        self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
 
     def test_is_whitespace(self):
-        self.assertTrue(_is_whitespace(u" "))
-        self.assertTrue(_is_whitespace(u"\t"))
-        self.assertTrue(_is_whitespace(u"\r"))
-        self.assertTrue(_is_whitespace(u"\n"))
-        self.assertTrue(_is_whitespace(u"\u00A0"))
+        self.assertTrue(_is_whitespace(" "))
+        self.assertTrue(_is_whitespace("\t"))
+        self.assertTrue(_is_whitespace("\r"))
+        self.assertTrue(_is_whitespace("\n"))
+        self.assertTrue(_is_whitespace("\u00A0"))
 
-        self.assertFalse(_is_whitespace(u"A"))
-        self.assertFalse(_is_whitespace(u"-"))
+        self.assertFalse(_is_whitespace("A"))
+        self.assertFalse(_is_whitespace("-"))
 
     def test_is_control(self):
-        self.assertTrue(_is_control(u"\u0005"))
+        self.assertTrue(_is_control("\u0005"))
 
-        self.assertFalse(_is_control(u"A"))
-        self.assertFalse(_is_control(u" "))
-        self.assertFalse(_is_control(u"\t"))
-        self.assertFalse(_is_control(u"\r"))
+        self.assertFalse(_is_control("A"))
+        self.assertFalse(_is_control(" "))
+        self.assertFalse(_is_control("\t"))
+        self.assertFalse(_is_control("\r"))
 
     def test_is_punctuation(self):
-        self.assertTrue(_is_punctuation(u"-"))
-        self.assertTrue(_is_punctuation(u"$"))
-        self.assertTrue(_is_punctuation(u"`"))
-        self.assertTrue(_is_punctuation(u"."))
+        self.assertTrue(_is_punctuation("-"))
+        self.assertTrue(_is_punctuation("$"))
+        self.assertTrue(_is_punctuation("`"))
+        self.assertTrue(_is_punctuation("."))
 
-        self.assertFalse(_is_punctuation(u"A"))
-        self.assertFalse(_is_punctuation(u" "))
+        self.assertFalse(_is_punctuation("A"))
+        self.assertFalse(_is_punctuation(" "))
 
     @slow
     def test_sequence_builders(self):
@@ -140,5 +148,5 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         assert encoded_pair == [101] + text + [102] + text_2 + [102]
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_ctrl_test.py b/transformers/tests/tokenization_ctrl_test.py
index ad16cf07fa..04c9dec523 100644
--- a/transformers/tests/tokenization_ctrl_test.py
+++ b/transformers/tests/tokenization_ctrl_test.py
@@ -22,6 +22,7 @@ from transformers.tokenization_ctrl import CTRLTokenizer, VOCAB_FILES_NAMES
 
 from .tokenization_tests_commons import CommonTestCases
 
+
 class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
     tokenizer_class = CTRLTokenizer
@@ -30,13 +31,13 @@ class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
         super(CTRLTokenizationTest, self).setUp()
 
         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = ['adapt', 're@@', 'a@@', 'apt', 'c@@', 't', '<unk>']
+        vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", 'a p', 'ap t</w>', 'r e', 'a d', 'ad apt</w>', '']
+        merges = ["#version: 0.2", "a p", "ap t</w>", "r e", "a d", "ad apt</w>", ""]
         self.special_tokens_map = {"unk_token": "<unk>"}
 
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
         with open(self.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
         with open(self.merges_file, "w", encoding="utf-8") as fp:
@@ -47,23 +48,22 @@ class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
         return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self):
-        input_text = u"adapt react readapt apt"
-        output_text = u"adapt react readapt apt"
+        input_text = "adapt react readapt apt"
+        output_text = "adapt react readapt apt"
         return input_text, output_text
 
     def test_full_tokenizer(self):
         tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
         text = "adapt react readapt apt"
-        bpe_tokens = 'adapt re@@ a@@ c@@ t re@@ adapt apt'.split()
+        bpe_tokens = "adapt re@@ a@@ c@@ t re@@ adapt apt".split()
         tokens = tokenizer.tokenize(text)
         self.assertListEqual(tokens, bpe_tokens)
 
         input_tokens = tokens + [tokenizer.unk_token]
 
         input_bpe_tokens = [0, 1, 2, 4, 5, 1, 0, 3, 6]
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_distilbert_test.py b/transformers/tests/tokenization_distilbert_test.py
index e815eca672..551f9e188e 100644
--- a/transformers/tests/tokenization_distilbert_test.py
+++ b/transformers/tests/tokenization_distilbert_test.py
@@ -18,12 +18,13 @@ import os
 import unittest
 from io import open
 
-from transformers.tokenization_distilbert import (DistilBertTokenizer)
+from transformers.tokenization_distilbert import DistilBertTokenizer
 
 from .tokenization_tests_commons import CommonTestCases
 from .tokenization_bert_test import BertTokenizationTest
 from .utils import slow
 
+
 class DistilBertTokenizationTest(BertTokenizationTest):
 
     tokenizer_class = DistilBertTokenizer
@@ -42,9 +43,10 @@ class DistilBertTokenizationTest(BertTokenizationTest):
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
 
         assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
-        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
-               text_2 + [tokenizer.sep_token_id]
+        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [
+            tokenizer.sep_token_id
+        ]
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_gpt2_test.py b/transformers/tests/tokenization_gpt2_test.py
index 5eae767bdf..552b73416e 100644
--- a/transformers/tests/tokenization_gpt2_test.py
+++ b/transformers/tests/tokenization_gpt2_test.py
@@ -23,6 +23,7 @@ from transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
 
 from .tokenization_tests_commons import CommonTestCases
 
+
 class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
 
     tokenizer_class = GPT2Tokenizer
@@ -31,16 +32,34 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
         super(GPT2TokenizationTest, self).setUp()
 
         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
-                 "\u0120", "\u0120l", "\u0120n",
-                 "\u0120lo", "\u0120low", "er",
-                 "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]
+        vocab = [
+            "l",
+            "o",
+            "w",
+            "e",
+            "r",
+            "s",
+            "t",
+            "i",
+            "d",
+            "n",
+            "\u0120",
+            "\u0120l",
+            "\u0120n",
+            "\u0120lo",
+            "\u0120low",
+            "er",
+            "\u0120lowest",
+            "\u0120newer",
+            "\u0120wider",
+            "<unk>",
+        ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
         self.special_tokens_map = {"unk_token": "<unk>"}
 
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
         with open(self.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
         with open(self.merges_file, "w", encoding="utf-8") as fp:
@@ -51,8 +70,8 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
         return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self):
-        input_text = u"lower newer"
-        output_text = u"lower newer"
+        input_text = "lower newer"
+        output_text = "lower newer"
         return input_text, output_text
 
     def test_full_tokenizer(self):
@@ -64,8 +83,8 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
 
         input_tokens = tokens + [tokenizer.unk_token]
         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_openai_test.py b/transformers/tests/tokenization_openai_test.py
index 56aa219ddc..c6a802b7be 100644
--- a/transformers/tests/tokenization_openai_test.py
+++ b/transformers/tests/tokenization_openai_test.py
@@ -31,15 +31,34 @@ class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
         super(OpenAIGPTTokenizationTest, self).setUp()
 
         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
-                 "w</w>", "r</w>", "t</w>",
-                 "lo", "low", "er</w>",
-                 "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
+        vocab = [
+            "l",
+            "o",
+            "w",
+            "e",
+            "r",
+            "s",
+            "t",
+            "i",
+            "d",
+            "n",
+            "w</w>",
+            "r</w>",
+            "t</w>",
+            "lo",
+            "low",
+            "er</w>",
+            "low</w>",
+            "lowest</w>",
+            "newer</w>",
+            "wider</w>",
+            "<unk>",
+        ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
 
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
         with open(self.vocab_file, "w") as fp:
             fp.write(json.dumps(vocab_tokens))
         with open(self.merges_file, "w") as fp:
@@ -49,11 +68,10 @@ class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
         return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self):
-        input_text = u"lower newer"
-        output_text = u"lower newer"
+        input_text = "lower newer"
+        output_text = "lower newer"
         return input_text, output_text
 
-
     def test_full_tokenizer(self):
         tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file)
 
@@ -64,9 +82,8 @@ class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
         input_tokens = tokens + ["<unk>"]
         input_bpe_tokens = [14, 15, 20]
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_roberta_test.py b/transformers/tests/tokenization_roberta_test.py
index 8ad0b59511..a1d9d5fb72 100644
--- a/transformers/tests/tokenization_roberta_test.py
+++ b/transformers/tests/tokenization_roberta_test.py
@@ -31,16 +31,34 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
         super(RobertaTokenizationTest, self).setUp()
 
         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
-                 "\u0120", "\u0120l", "\u0120n",
-                 "\u0120lo", "\u0120low", "er",
-                 "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]
+        vocab = [
+            "l",
+            "o",
+            "w",
+            "e",
+            "r",
+            "s",
+            "t",
+            "i",
+            "d",
+            "n",
+            "\u0120",
+            "\u0120l",
+            "\u0120n",
+            "\u0120lo",
+            "\u0120low",
+            "er",
+            "\u0120lowest",
+            "\u0120newer",
+            "\u0120wider",
+            "<unk>",
+        ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
         self.special_tokens_map = {"unk_token": "<unk>"}
 
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
         with open(self.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
         with open(self.merges_file, "w", encoding="utf-8") as fp:
@@ -51,8 +69,8 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
         return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self):
-        input_text = u"lower newer"
-        output_text = u"lower newer"
+        input_text = "lower newer"
+        output_text = "lower newer"
         return input_text, output_text
 
     def test_full_tokenizer(self):
@@ -64,19 +82,15 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
         input_tokens = tokens + [tokenizer.unk_token]
         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
     def roberta_dict_integration_testing(self):
         tokenizer = self.get_tokenizer()
 
+        self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2])
         self.assertListEqual(
-            tokenizer.encode('Hello world!', add_special_tokens=False),
-            [0, 31414, 232, 328, 2]
-        )
-        self.assertListEqual(
-            tokenizer.encode('Hello world! cécé herlolip 418', add_special_tokens=False),
-            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
+            tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False),
+            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2],
         )
 
     @slow
@@ -87,7 +101,9 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
         text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
 
         encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
-        encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
+        encoded_pair_from_decode = tokenizer.encode(
+            "sequence builders", "multi-sequence build", add_special_tokens=True
+        )
 
         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
@@ -96,5 +112,5 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
         assert encoded_pair == encoded_pair_from_decode
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_t5_test.py b/transformers/tests/tokenization_t5_test.py
index 0b4f960e32..09bc0267f1 100644
--- a/transformers/tests/tokenization_t5_test.py
+++ b/transformers/tests/tokenization_t5_test.py
@@ -17,13 +17,13 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 
-from transformers.tokenization_t5 import (T5Tokenizer)
+from transformers.tokenization_t5 import T5Tokenizer
 from transformers.tokenization_xlnet import SPIECE_UNDERLINE
 
 from .tokenization_tests_commons import CommonTestCases
 
-SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
-                    'fixtures/test_sentencepiece.model')
+SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
+
 
 class T5TokenizationTest(CommonTestCases.CommonTokenizerTester):
 
@@ -40,38 +40,76 @@ class T5TokenizationTest(CommonTestCases.CommonTokenizerTester):
         return T5Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self):
-        input_text = u"This is a test"
-        output_text = u"This is a test"
+        input_text = "This is a test"
+        output_text = "This is a test"
         return input_text, output_text
 
     def test_full_tokenizer(self):
         tokenizer = T5Tokenizer(SAMPLE_VOCAB)
 
-        tokens = tokenizer.tokenize(u'This is a test')
-        self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
+        tokens = tokenizer.tokenize("This is a test")
+        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
 
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
+
+        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
         self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
-
-        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
-        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
-                                    u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
-                                    u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
-                                    SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
+            tokens,
+            [
+                SPIECE_UNDERLINE + "I",
+                SPIECE_UNDERLINE + "was",
+                SPIECE_UNDERLINE + "b",
+                "or",
+                "n",
+                SPIECE_UNDERLINE + "in",
+                SPIECE_UNDERLINE + "",
+                "9",
+                "2",
+                "0",
+                "0",
+                "0",
+                ",",
+                SPIECE_UNDERLINE + "and",
+                SPIECE_UNDERLINE + "this",
+                SPIECE_UNDERLINE + "is",
+                SPIECE_UNDERLINE + "f",
+                "al",
+                "s",
+                "é",
+                ".",
+            ],
+        )
         ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(
-            ids, [8, 21, 84, 55, 24, 19, 7, 0,
-                602, 347, 347, 347, 3, 12, 66,
-                46, 72, 80, 6, 0, 4])
+        self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4])
 
         back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
-                                        u'or', u'n', SPIECE_UNDERLINE + u'in',
-                                        SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
-                                        SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
-                                        SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
-                                        u'<unk>', u'.'])
+        self.assertListEqual(
+            back_tokens,
+            [
+                SPIECE_UNDERLINE + "I",
+                SPIECE_UNDERLINE + "was",
+                SPIECE_UNDERLINE + "b",
+                "or",
+                "n",
+                SPIECE_UNDERLINE + "in",
+                SPIECE_UNDERLINE + "",
+                "<unk>",
+                "2",
+                "0",
+                "0",
+                "0",
+                ",",
+                SPIECE_UNDERLINE + "and",
+                SPIECE_UNDERLINE + "this",
+                SPIECE_UNDERLINE + "is",
+                SPIECE_UNDERLINE + "f",
+                "al",
+                "s",
+                "<unk>",
+                ".",
+            ],
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index c417d033dc..ba81101084 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -26,19 +26,23 @@ if sys.version_info[0] == 2:
 
     class TemporaryDirectory(object):
         """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""
+
         def __enter__(self):
             self.name = tempfile.mkdtemp()
             return self.name
+
         def __exit__(self, exc_type, exc_value, traceback):
             shutil.rmtree(self.name)
+
+
 else:
     import pickle
+
     TemporaryDirectory = tempfile.TemporaryDirectory
     unicode = str
 
 
 class CommonTestCases:
-
     class CommonTokenizerTester(unittest.TestCase):
 
         tokenizer_class = None
@@ -57,17 +61,23 @@ class CommonTestCases:
 
         def test_tokenizers_common_properties(self):
             tokenizer = self.get_tokenizer()
-            attributes_list = ["bos_token", "eos_token", "unk_token", "sep_token",
-                                "pad_token", "cls_token", "mask_token"]
+            attributes_list = [
+                "bos_token",
+                "eos_token",
+                "unk_token",
+                "sep_token",
+                "pad_token",
+                "cls_token",
+                "mask_token",
+            ]
             for attr in attributes_list:
                 self.assertTrue(hasattr(tokenizer, attr))
                 self.assertTrue(hasattr(tokenizer, attr + "_id"))
 
             self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
-            self.assertTrue(hasattr(tokenizer, 'additional_special_tokens_ids'))
+            self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids"))
 
-            attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder",
-                                "added_tokens_decoder"]
+            attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", "added_tokens_decoder"]
             for attr in attributes_list:
                 self.assertTrue(hasattr(tokenizer, attr))
 
@@ -79,13 +89,13 @@ class CommonTestCases:
             # Now let's start the test
             tokenizer = self.get_tokenizer(max_len=42)
 
-            before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
+            before_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
 
             with TemporaryDirectory() as tmpdirname:
                 tokenizer.save_pretrained(tmpdirname)
                 tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)
 
-                after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
+                after_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
                 self.assertListEqual(before_tokens, after_tokens)
 
                 self.assertEqual(tokenizer.max_len, 42)
@@ -96,12 +106,12 @@ class CommonTestCases:
             tokenizer = self.get_tokenizer()
             self.assertIsNotNone(tokenizer)
 
-            text = u"Munich and Berlin are nice cities"
+            text = "Munich and Berlin are nice cities"
             subwords = tokenizer.tokenize(text)
 
             with TemporaryDirectory() as tmpdirname:
 
-                filename = os.path.join(tmpdirname, u"tokenizer.bin")
+                filename = os.path.join(tmpdirname, "tokenizer.bin")
                 with open(filename, "wb") as handle:
                     pickle.dump(tokenizer, handle)
 
@@ -122,7 +132,7 @@ class CommonTestCases:
 
             toks0 = tokenizer.tokenize(text)  # toks before adding new_toks
 
-            new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", 'AAAAA BBBBBB', 'CCCCCCCCCDDDDDDDD']
+            new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
             added = tokenizer.add_tokens(new_toks)
             self.assertEqual(added, 2)
 
@@ -178,8 +188,7 @@ class CommonTestCases:
             self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
             self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
 
-            new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
-                          'pad_token': "<<<<<|||>|>>>>|>"}
+            new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
             added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
             vocab_size_3 = tokenizer.vocab_size
             all_size_3 = len(tokenizer)
@@ -189,8 +198,9 @@ class CommonTestCases:
             self.assertEqual(added_toks_2, len(new_toks_2))
             self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
 
-            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
-                                      add_special_tokens=False)
+            tokens = tokenizer.encode(
+                ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
+            )
             out_string = tokenizer.decode(tokens)
 
             self.assertGreaterEqual(len(tokens), 6)
@@ -242,7 +252,7 @@ class CommonTestCases:
         def test_encode_decode_with_spaces(self):
             tokenizer = self.get_tokenizer()
 
-            new_toks = ['[ABC]', '[DEF]', 'GHI IHG']
+            new_toks = ["[ABC]", "[DEF]", "GHI IHG"]
             tokenizer.add_tokens(new_toks)
             input = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
             encoded = tokenizer.encode(input, add_special_tokens=False)
@@ -264,7 +274,7 @@ class CommonTestCases:
 
             tokenizer = self.get_tokenizer()
 
-            if tokenizer.build_inputs_with_special_tokens.__qualname__.split('.')[0] != "PreTrainedTokenizer":
+            if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
                 seq_0 = "Test this method."
                 seq_1 = "With these inputs."
                 information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
@@ -293,17 +303,19 @@ class CommonTestCases:
             sequence = tokenizer.encode(seq_0, add_special_tokens=False)
             num_added_tokens = tokenizer.num_added_tokens()
             total_length = len(sequence) + num_added_tokens
-            information = tokenizer.encode_plus(seq_0,
-                                                max_length=total_length - 2,
-                                                add_special_tokens=True,
-                                                stride=stride,
-                                                return_overflowing_tokens=True)
+            information = tokenizer.encode_plus(
+                seq_0,
+                max_length=total_length - 2,
+                add_special_tokens=True,
+                stride=stride,
+                return_overflowing_tokens=True,
+            )
 
             truncated_sequence = information["input_ids"]
             overflowing_tokens = information["overflowing_tokens"]
 
             self.assertEqual(len(overflowing_tokens), 2 + stride)
-            self.assertEqual(overflowing_tokens, sequence[-(2 + stride):])
+            self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :])
             self.assertEqual(len(truncated_sequence), total_length - 2)
             self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))
 
@@ -320,24 +332,35 @@ class CommonTestCases:
             sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
             truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
                 tokenizer.encode(seq_0, add_special_tokens=False),
-                tokenizer.encode(seq_1, add_special_tokens=False)[:-2]
+                tokenizer.encode(seq_1, add_special_tokens=False)[:-2],
             )
 
-            information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
-                                                stride=stride, truncation_strategy='only_second',
-                                                return_overflowing_tokens=True)
-            information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
-                                                                add_special_tokens=True, stride=stride,
-                                                                truncation_strategy='only_first',
-                                                                return_overflowing_tokens=True)
+            information = tokenizer.encode_plus(
+                seq_0,
+                seq_1,
+                max_length=len(sequence) - 2,
+                add_special_tokens=True,
+                stride=stride,
+                truncation_strategy="only_second",
+                return_overflowing_tokens=True,
+            )
+            information_first_truncated = tokenizer.encode_plus(
+                seq_0,
+                seq_1,
+                max_length=len(sequence) - 2,
+                add_special_tokens=True,
+                stride=stride,
+                truncation_strategy="only_first",
+                return_overflowing_tokens=True,
+            )
 
             truncated_sequence = information["input_ids"]
             overflowing_tokens = information["overflowing_tokens"]
             overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
 
             self.assertEqual(len(overflowing_tokens), 2 + stride)
-            self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride):])
-            self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride):])
+            self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride) :])
+            self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride) :])
             self.assertEqual(len(truncated_sequence), len(sequence) - 2)
             self.assertEqual(truncated_sequence, truncated_second_sequence)
 
@@ -361,37 +384,47 @@ class CommonTestCases:
 
             # Testing single inputs
             encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
-            encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True, return_special_tokens_mask=True)
+            encoded_sequence_dict = tokenizer.encode_plus(
+                sequence_0, add_special_tokens=True, return_special_tokens_mask=True
+            )
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
             special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
             self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
 
-            filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+            filtered_sequence = [
+                (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
+            ]
             filtered_sequence = [x for x in filtered_sequence if x is not None]
             self.assertEqual(encoded_sequence, filtered_sequence)
 
             # Testing inputs pairs
-            encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(sequence_1,
-                                                                                                         add_special_tokens=False)
-            encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True,
-                                                          return_special_tokens_mask=True)
+            encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(
+                sequence_1, add_special_tokens=False
+            )
+            encoded_sequence_dict = tokenizer.encode_plus(
+                sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True
+            )
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
             special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
             self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
 
-            filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+            filtered_sequence = [
+                (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
+            ]
             filtered_sequence = [x for x in filtered_sequence if x is not None]
             self.assertEqual(encoded_sequence, filtered_sequence)
 
             # Testing with already existing special tokens
             if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
-                tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
-            encoded_sequence_dict = tokenizer.encode_plus(sequence_0,
-                                                          add_special_tokens=True,
-                                                          return_special_tokens_mask=True)
+                tokenizer.add_special_tokens({"cls_token": "</s>", "sep_token": "<s>"})
+            encoded_sequence_dict = tokenizer.encode_plus(
+                sequence_0, add_special_tokens=True, return_special_tokens_mask=True
+            )
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
             special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
-            special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True)
+            special_tokens_mask = tokenizer.get_special_tokens_mask(
+                encoded_sequence_w_special, already_has_special_tokens=True
+            )
             self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
             self.assertEqual(special_tokens_mask_orig, special_tokens_mask)
 
@@ -406,7 +439,9 @@ class CommonTestCases:
             tokenizer.padding_side = "right"
             encoded_sequence = tokenizer.encode(sequence)
             sequence_length = len(encoded_sequence)
-            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
+            padded_sequence = tokenizer.encode(
+                sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
+            )
             padded_sequence_length = len(padded_sequence)
             assert sequence_length + padding_size == padded_sequence_length
             assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
@@ -415,7 +450,9 @@ class CommonTestCases:
             tokenizer.padding_side = "left"
             encoded_sequence = tokenizer.encode(sequence)
             sequence_length = len(encoded_sequence)
-            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
+            padded_sequence = tokenizer.encode(
+                sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
+            )
             padded_sequence_length = len(padded_sequence)
             assert sequence_length + padding_size == padded_sequence_length
             assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
@@ -446,38 +483,48 @@ class CommonTestCases:
             token_type_padding_idx = tokenizer.pad_token_type_id
 
             encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
-            input_ids = encoded_sequence['input_ids']
-            token_type_ids = encoded_sequence['token_type_ids']
-            attention_mask = encoded_sequence['attention_mask']
-            special_tokens_mask = encoded_sequence['special_tokens_mask']
+            input_ids = encoded_sequence["input_ids"]
+            token_type_ids = encoded_sequence["token_type_ids"]
+            attention_mask = encoded_sequence["attention_mask"]
+            special_tokens_mask = encoded_sequence["special_tokens_mask"]
             sequence_length = len(input_ids)
 
             # Test right padding
             tokenizer.padding_side = "right"
-            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True)
-            padded_input_ids = padded_sequence['input_ids']
-            padded_token_type_ids = padded_sequence['token_type_ids']
-            padded_attention_mask = padded_sequence['attention_mask']
-            padded_special_tokens_mask = padded_sequence['special_tokens_mask']
+            padded_sequence = tokenizer.encode_plus(
+                sequence,
+                max_length=sequence_length + padding_size,
+                pad_to_max_length=True,
+                return_special_tokens_mask=True,
+            )
+            padded_input_ids = padded_sequence["input_ids"]
+            padded_token_type_ids = padded_sequence["token_type_ids"]
+            padded_attention_mask = padded_sequence["attention_mask"]
+            padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
             padded_sequence_length = len(padded_input_ids)
 
             assert sequence_length + padding_size == padded_sequence_length
             assert input_ids + [padding_idx] * padding_size == padded_input_ids
             assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
-            assert attention_mask + [0] * padding_size == padded_attention_mask 
-            assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask 
+            assert attention_mask + [0] * padding_size == padded_attention_mask
+            assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask
 
             # Test left padding
             tokenizer.padding_side = "left"
-            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True)
-            padded_input_ids = padded_sequence['input_ids']
-            padded_token_type_ids = padded_sequence['token_type_ids']
-            padded_attention_mask = padded_sequence['attention_mask']
-            padded_special_tokens_mask = padded_sequence['special_tokens_mask']
+            padded_sequence = tokenizer.encode_plus(
+                sequence,
+                max_length=sequence_length + padding_size,
+                pad_to_max_length=True,
+                return_special_tokens_mask=True,
+            )
+            padded_input_ids = padded_sequence["input_ids"]
+            padded_token_type_ids = padded_sequence["token_type_ids"]
+            padded_attention_mask = padded_sequence["attention_mask"]
+            padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
             padded_sequence_length = len(padded_input_ids)
 
             assert sequence_length + padding_size == padded_sequence_length
             assert [padding_idx] * padding_size + input_ids == padded_input_ids
             assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
-            assert [0] * padding_size + attention_mask == padded_attention_mask 
-            assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask 
\ No newline at end of file
+            assert [0] * padding_size + attention_mask == padded_attention_mask
+            assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
diff --git a/transformers/tests/tokenization_transfo_xl_test.py b/transformers/tests/tokenization_transfo_xl_test.py
index 5495ebd3a6..8b737283da 100644
--- a/transformers/tests/tokenization_transfo_xl_test.py
+++ b/transformers/tests/tokenization_transfo_xl_test.py
@@ -37,45 +37,53 @@ class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
         super(TransfoXLTokenizationTest, self).setUp()
 
         vocab_tokens = [
-            "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
-            "running", ",", "low", "l",
+            "<unk>",
+            "[CLS]",
+            "[SEP]",
+            "want",
+            "unwanted",
+            "wa",
+            "un",
+            "running",
+            ",",
+            "low",
+            "l",
         ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-        with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
     def get_tokenizer(self, **kwargs):
-        kwargs['lower_case'] = True
+        kwargs["lower_case"] = True
         return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self):
-        input_text = u"<unk> UNwanted , running"
-        output_text = u"<unk> unwanted, running"
+        input_text = "<unk> UNwanted , running"
+        output_text = "<unk> unwanted, running"
         return input_text, output_text
 
     def test_full_tokenizer(self):
         tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True)
 
-        tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
+        tokens = tokenizer.tokenize("<unk> UNwanted , running")
         self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
 
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
 
     def test_full_tokenizer_lower(self):
         tokenizer = TransfoXLTokenizer(lower_case=True)
 
         self.assertListEqual(
-            tokenizer.tokenize(u" \tHeLLo ! how  \n Are yoU ?  "),
-            ["hello", "!", "how", "are", "you", "?"])
+            tokenizer.tokenize(" \tHeLLo ! how  \n Are yoU ?  "), ["hello", "!", "how", "are", "you", "?"]
+        )
 
     def test_full_tokenizer_no_lower(self):
         tokenizer = TransfoXLTokenizer(lower_case=False)
 
         self.assertListEqual(
-            tokenizer.tokenize(u" \tHeLLo ! how  \n Are yoU ?  "),
-            ["HeLLo", "!", "how", "Are", "yoU", "?"])
+            tokenizer.tokenize(" \tHeLLo ! how  \n Are yoU ?  "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_utils_test.py b/transformers/tests/tokenization_utils_test.py
index ff3f80ff7d..4fa92c44bf 100644
--- a/transformers/tests/tokenization_utils_test.py
+++ b/transformers/tests/tokenization_utils_test.py
@@ -24,8 +24,8 @@ from transformers.tokenization_gpt2 import GPT2Tokenizer
 
 from .utils import slow
 
-class TokenizerUtilsTest(unittest.TestCase):
 
+class TokenizerUtilsTest(unittest.TestCase):
     def check_tokenizer_from_pretrained(self, tokenizer_class):
         s3_models = list(tokenizer_class.max_model_input_sizes.keys())
         for model_name in s3_models[:1]:
@@ -46,5 +46,6 @@ class TokenizerUtilsTest(unittest.TestCase):
     def test_pretrained_tokenizers(self):
         self.check_tokenizer_from_pretrained(GPT2Tokenizer)
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_xlm_test.py b/transformers/tests/tokenization_xlm_test.py
index 7582a46662..e9aa2b7d0e 100644
--- a/transformers/tests/tokenization_xlm_test.py
+++ b/transformers/tests/tokenization_xlm_test.py
@@ -23,6 +23,7 @@ from transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
 from .tokenization_tests_commons import CommonTestCases
 from .utils import slow
 
+
 class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
     tokenizer_class = XLMTokenizer
@@ -31,15 +32,34 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
         super(XLMTokenizationTest, self).setUp()
 
         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
-                 "w</w>", "r</w>", "t</w>",
-                 "lo", "low", "er</w>",
-                 "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
+        vocab = [
+            "l",
+            "o",
+            "w",
+            "e",
+            "r",
+            "s",
+            "t",
+            "i",
+            "d",
+            "n",
+            "w</w>",
+            "r</w>",
+            "t</w>",
+            "lo",
+            "low",
+            "er</w>",
+            "low</w>",
+            "lowest</w>",
+            "newer</w>",
+            "wider</w>",
+            "<unk>",
+        ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
 
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
         with open(self.vocab_file, "w") as fp:
             fp.write(json.dumps(vocab_tokens))
         with open(self.merges_file, "w") as fp:
@@ -49,8 +69,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
         return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self):
-        input_text = u"lower newer"
-        output_text = u"lower newer"
+        input_text = "lower newer"
+        output_text = "lower newer"
         return input_text, output_text
 
     def test_full_tokenizer(self):
@@ -64,8 +84,7 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
         input_tokens = tokens + ["<unk>"]
         input_bpe_tokens = [14, 15, 20]
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
     @slow
     def test_sequence_builders(self):
@@ -80,5 +99,6 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
         assert encoded_sentence == [1] + text + [1]
         assert encoded_pair == [1] + text + [1] + text_2 + [1]
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_xlnet_test.py b/transformers/tests/tokenization_xlnet_test.py
index b68495a796..32482449a4 100644
--- a/transformers/tests/tokenization_xlnet_test.py
+++ b/transformers/tests/tokenization_xlnet_test.py
@@ -17,13 +17,13 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 
-from transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
+from transformers.tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 
 from .tokenization_tests_commons import CommonTestCases
 from .utils import slow
 
-SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
-                    'fixtures/test_sentencepiece.model')
+SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
+
 
 class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
@@ -40,55 +40,135 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
         return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self):
-        input_text = u"This is a test"
-        output_text = u"This is a test"
+        input_text = "This is a test"
+        output_text = "This is a test"
         return input_text, output_text
 
-
     def test_full_tokenizer(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
 
-        tokens = tokenizer.tokenize(u'This is a test')
-        self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
+        tokens = tokenizer.tokenize("This is a test")
+        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
 
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
+
+        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
         self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
-
-        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
-        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
-                                    u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
-                                    u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
-                                    SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
+            tokens,
+            [
+                SPIECE_UNDERLINE + "I",
+                SPIECE_UNDERLINE + "was",
+                SPIECE_UNDERLINE + "b",
+                "or",
+                "n",
+                SPIECE_UNDERLINE + "in",
+                SPIECE_UNDERLINE + "",
+                "9",
+                "2",
+                "0",
+                "0",
+                "0",
+                ",",
+                SPIECE_UNDERLINE + "and",
+                SPIECE_UNDERLINE + "this",
+                SPIECE_UNDERLINE + "is",
+                SPIECE_UNDERLINE + "f",
+                "al",
+                "s",
+                "é",
+                ".",
+            ],
+        )
         ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(
-            ids, [8, 21, 84, 55, 24, 19, 7, 0,
-                602, 347, 347, 347, 3, 12, 66,
-                46, 72, 80, 6, 0, 4])
+        self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4])
 
         back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
-                                        u'or', u'n', SPIECE_UNDERLINE + u'in',
-                                        SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
-                                        SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
-                                        SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
-                                        u'<unk>', u'.'])
+        self.assertListEqual(
+            back_tokens,
+            [
+                SPIECE_UNDERLINE + "I",
+                SPIECE_UNDERLINE + "was",
+                SPIECE_UNDERLINE + "b",
+                "or",
+                "n",
+                SPIECE_UNDERLINE + "in",
+                SPIECE_UNDERLINE + "",
+                "<unk>",
+                "2",
+                "0",
+                "0",
+                "0",
+                ",",
+                SPIECE_UNDERLINE + "and",
+                SPIECE_UNDERLINE + "this",
+                SPIECE_UNDERLINE + "is",
+                SPIECE_UNDERLINE + "f",
+                "al",
+                "s",
+                "<unk>",
+                ".",
+            ],
+        )
 
     def test_tokenizer_lower(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)
-        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
-        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'', u'i', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
-                                      u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
-                                      u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
-                                      SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])
-        self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), [u"▁he", u"ll", u"o"])
+        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
+        self.assertListEqual(
+            tokens,
+            [
+                SPIECE_UNDERLINE + "",
+                "i",
+                SPIECE_UNDERLINE + "was",
+                SPIECE_UNDERLINE + "b",
+                "or",
+                "n",
+                SPIECE_UNDERLINE + "in",
+                SPIECE_UNDERLINE + "",
+                "9",
+                "2",
+                "0",
+                "0",
+                "0",
+                ",",
+                SPIECE_UNDERLINE + "and",
+                SPIECE_UNDERLINE + "this",
+                SPIECE_UNDERLINE + "is",
+                SPIECE_UNDERLINE + "f",
+                "al",
+                "se",
+                ".",
+            ],
+        )
+        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["▁he", "ll", "o"])
 
     def test_tokenizer_no_lower(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False)
-        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
-        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or',
-                                      u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
-                                      u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
-                                      SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])
+        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
+        self.assertListEqual(
+            tokens,
+            [
+                SPIECE_UNDERLINE + "I",
+                SPIECE_UNDERLINE + "was",
+                SPIECE_UNDERLINE + "b",
+                "or",
+                "n",
+                SPIECE_UNDERLINE + "in",
+                SPIECE_UNDERLINE + "",
+                "9",
+                "2",
+                "0",
+                "0",
+                "0",
+                ",",
+                SPIECE_UNDERLINE + "and",
+                SPIECE_UNDERLINE + "this",
+                SPIECE_UNDERLINE + "is",
+                SPIECE_UNDERLINE + "f",
+                "al",
+                "se",
+                ".",
+            ],
+        )
 
     @slow
     def test_sequence_builders(self):
@@ -104,5 +184,5 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
         assert encoded_pair == text + [4] + text_2 + [4, 3]
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py
index ba0e19f420..aab5e5a8a5 100644
--- a/transformers/tests/utils.py
+++ b/transformers/tests/utils.py
@@ -27,6 +27,7 @@ def parse_flag_from_env(key, default=False):
             raise ValueError("If set, {} must be yes or no.".format(key))
     return _value
 
+
 _run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
 _run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)
 
diff --git a/transformers/tokenization_albert.py b/transformers/tokenization_albert.py
index 6b92d07218..b03b3ca119 100644
--- a/transformers/tokenization_albert.py
+++ b/transformers/tokenization_albert.py
@@ -13,8 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Tokenization classes for ALBERT model."""
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 from .tokenization_utils import PreTrainedTokenizer
 import logging
@@ -24,34 +23,34 @@ import os
 from shutil import copyfile
 
 logger = logging.getLogger(__name__)
-VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
-    {
-        'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model",
-        'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model",
-        'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model",
-        'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model",
-        'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model",
-        'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model",
-        'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model",
-        'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model",
+    "vocab_file": {
+        "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model",
+        "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model",
+        "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model",
+        "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model",
+        "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model",
+        "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model",
+        "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model",
+        "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'albert-base-v1': 512,
-    'albert-large-v1': 512,
-    'albert-xlarge-v1': 512,
-    'albert-xxlarge-v1': 512,
-    'albert-base-v2': 512,
-    'albert-large-v2': 512,
-    'albert-xlarge-v2': 512,
-    'albert-xxlarge-v2': 512,
+    "albert-base-v1": 512,
+    "albert-large-v1": 512,
+    "albert-xlarge-v1": 512,
+    "albert-xxlarge-v1": 512,
+    "albert-base-v2": 512,
+    "albert-large-v2": 512,
+    "albert-xlarge-v2": 512,
+    "albert-xxlarge-v2": 512,
 }
 
-SPIECE_UNDERLINE = u'▁'
+SPIECE_UNDERLINE = "▁"
+
 
 class AlbertTokenizer(PreTrainedTokenizer):
     """
@@ -59,18 +58,36 @@ class AlbertTokenizer(PreTrainedTokenizer):
 
             - requires `SentencePiece <https://github.com/google/sentencepiece>`_
     """
+
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file,
-                 do_lower_case=True, remove_space=True, keep_accents=False,
-                 bos_token="[CLS]", eos_token="[SEP]", unk_token="<unk>", sep_token="[SEP]",
-                 pad_token="<pad>", cls_token="[CLS]", mask_token="[MASK]", **kwargs):
-        super(AlbertTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
-                                             unk_token=unk_token, sep_token=sep_token,
-                                             pad_token=pad_token, cls_token=cls_token,
-                                             mask_token=mask_token, **kwargs)
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=True,
+        remove_space=True,
+        keep_accents=False,
+        bos_token="[CLS]",
+        eos_token="[SEP]",
+        unk_token="<unk>",
+        sep_token="[SEP]",
+        pad_token="<pad>",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        **kwargs
+    ):
+        super(AlbertTokenizer, self).__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            **kwargs
+        )
 
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
@@ -78,8 +95,10 @@ class AlbertTokenizer(PreTrainedTokenizer):
         try:
             import sentencepiece as spm
         except ImportError:
-            logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
-                           "pip install sentencepiece")
+            logger.warning(
+                "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
+                "pip install sentencepiece"
+            )
 
         self.do_lower_case = do_lower_case
         self.remove_space = remove_space
@@ -103,24 +122,26 @@ class AlbertTokenizer(PreTrainedTokenizer):
         try:
             import sentencepiece as spm
         except ImportError:
-            logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
-                           "pip install sentencepiece")
+            logger.warning(
+                "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
+                "pip install sentencepiece"
+            )
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(self.vocab_file)
 
     def preprocess_text(self, inputs):
         if self.remove_space:
-            outputs = ' '.join(inputs.strip().split())
+            outputs = " ".join(inputs.strip().split())
         else:
             outputs = inputs
         outputs = outputs.replace("``", '"').replace("''", '"')
 
         if six.PY2 and isinstance(outputs, str):
-            outputs = outputs.decode('utf-8')
+            outputs = outputs.decode("utf-8")
 
         if not self.keep_accents:
-            outputs = unicodedata.normalize('NFKD', outputs)
-            outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
+            outputs = unicodedata.normalize("NFKD", outputs)
+            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
         if self.do_lower_case:
             outputs = outputs.lower()
 
@@ -133,7 +154,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
         text = self.preprocess_text(text)
         # note(zhiliny): in some systems, sentencepiece only accepts str for py2
         if six.PY2 and isinstance(text, unicode):
-            text = text.encode('utf-8')
+            text = text.encode("utf-8")
 
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
@@ -141,9 +162,8 @@ class AlbertTokenizer(PreTrainedTokenizer):
             pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
         new_pieces = []
         for piece in pieces:
-            if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit():
-                cur_pieces = self.sp_model.EncodeAsPieces(
-                    piece[:-1].replace(SPIECE_UNDERLINE, ''))
+            if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
+                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                 if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                     if len(cur_pieces[0]) == 1:
                         cur_pieces = cur_pieces[1:]
@@ -159,7 +179,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
             ret_pieces = []
             for piece in new_pieces:
                 if isinstance(piece, str):
-                    piece = piece.decode('utf-8')
+                    piece = piece.decode("utf-8")
                 ret_pieces.append(piece)
             new_pieces = ret_pieces
 
@@ -173,12 +193,12 @@ class AlbertTokenizer(PreTrainedTokenizer):
         """Converts an index (integer) in a token (string/unicode) using the vocab."""
         token = self.sp_model.IdToPiece(index)
         if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode('utf-8')
+            token = token.decode("utf-8")
         return token
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
         return out_string
 
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
@@ -213,8 +233,10 @@ class AlbertTokenizer(PreTrainedTokenizer):
 
         if already_has_special_tokens:
             if token_ids_1 is not None:
-                raise ValueError("You should not supply a second sequence if the provided sequence of "
-                                 "ids is already formated with special tokens for the model.")
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formated with special tokens for the model."
+                )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
         if token_ids_1 is not None:
@@ -244,7 +266,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
 
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py
index 5377bd48cb..5d36fdcbaf 100644
--- a/transformers/tokenization_auto.py
+++ b/transformers/tokenization_auto.py
@@ -35,6 +35,7 @@ from .tokenization_xlm_roberta import XLMRobertaTokenizer
 
 logger = logging.getLogger(__name__)
 
+
 class AutoTokenizer(object):
     r""":class:`~transformers.AutoTokenizer` is a generic tokenizer class
         that will be instantiated as one of the tokenizer classes of the library
@@ -62,9 +63,12 @@ class AutoTokenizer(object):
 
         This class cannot be instantiated using `__init__()` (throw an error).
     """
+
     def __init__(self):
-        raise EnvironmentError("AutoTokenizer is designed to be instantiated "
-            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.")
+        raise EnvironmentError(
+            "AutoTokenizer is designed to be instantiated "
+            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
+        )
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
@@ -125,34 +129,38 @@ class AutoTokenizer(object):
             tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
 
         """
-        if 't5' in pretrained_model_name_or_path:
+        if "t5" in pretrained_model_name_or_path:
             return T5Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'distilbert' in pretrained_model_name_or_path:
+        elif "distilbert" in pretrained_model_name_or_path:
             return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'albert' in pretrained_model_name_or_path:
+        elif "albert" in pretrained_model_name_or_path:
             return AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'camembert' in pretrained_model_name_or_path:
+        elif "camembert" in pretrained_model_name_or_path:
             return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'xlm-roberta' in pretrained_model_name_or_path:
+        elif "xlm-roberta" in pretrained_model_name_or_path:
             return XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
+        elif "roberta" in pretrained_model_name_or_path:
             return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'bert-base-japanese' in pretrained_model_name_or_path:
+        elif "bert-base-japanese" in pretrained_model_name_or_path:
             return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'bert' in pretrained_model_name_or_path:
+        elif "bert" in pretrained_model_name_or_path:
             return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'openai-gpt' in pretrained_model_name_or_path:
+        elif "openai-gpt" in pretrained_model_name_or_path:
             return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'gpt2' in pretrained_model_name_or_path:
+        elif "gpt2" in pretrained_model_name_or_path:
             return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'transfo-xl' in pretrained_model_name_or_path:
+        elif "transfo-xl" in pretrained_model_name_or_path:
             return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'xlnet' in pretrained_model_name_or_path:
+        elif "xlnet" in pretrained_model_name_or_path:
             return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'xlm' in pretrained_model_name_or_path:
+        elif "xlm" in pretrained_model_name_or_path:
             return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'ctrl' in pretrained_model_name_or_path:
+        elif "ctrl" in pretrained_model_name_or_path:
             return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm-roberta', 'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
+        raise ValueError(
+            "Unrecognized model identifier in {}. Should contains one of "
+            "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+            "'xlm-roberta', 'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(
+                pretrained_model_name_or_path
+            )
+        )
diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py
index edc26d88cf..7b3705cc19 100644
--- a/transformers/tokenization_bert.py
+++ b/transformers/tokenization_bert.py
@@ -26,69 +26,68 @@ from .tokenization_utils import PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)
 
-VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
-    {
-        'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
-        'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
-        'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
-        'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
-        'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
-        'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
-        'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
-        'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
-        'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
-        'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
-        'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
-        'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
-        'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
-        'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
-        'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
-        'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt",
-        'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt",
+    "vocab_file": {
+        "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
+        "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+        "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
+        "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
+        "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
+        "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
+        "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
+        "bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
+        "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
+        "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
+        "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
+        "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
+        "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
+        "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
+        "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
+        "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt",
+        "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'bert-base-uncased': 512,
-    'bert-large-uncased': 512,
-    'bert-base-cased': 512,
-    'bert-large-cased': 512,
-    'bert-base-multilingual-uncased': 512,
-    'bert-base-multilingual-cased': 512,
-    'bert-base-chinese': 512,
-    'bert-base-german-cased': 512,
-    'bert-large-uncased-whole-word-masking': 512,
-    'bert-large-cased-whole-word-masking': 512,
-    'bert-large-uncased-whole-word-masking-finetuned-squad': 512,
-    'bert-large-cased-whole-word-masking-finetuned-squad': 512,
-    'bert-base-cased-finetuned-mrpc': 512,
-    'bert-base-german-dbmdz-cased': 512,
-    'bert-base-german-dbmdz-uncased': 512,
-    'bert-base-finnish-cased-v1': 512,
-    'bert-base-finnish-uncased-v1': 512,
+    "bert-base-uncased": 512,
+    "bert-large-uncased": 512,
+    "bert-base-cased": 512,
+    "bert-large-cased": 512,
+    "bert-base-multilingual-uncased": 512,
+    "bert-base-multilingual-cased": 512,
+    "bert-base-chinese": 512,
+    "bert-base-german-cased": 512,
+    "bert-large-uncased-whole-word-masking": 512,
+    "bert-large-cased-whole-word-masking": 512,
+    "bert-large-uncased-whole-word-masking-finetuned-squad": 512,
+    "bert-large-cased-whole-word-masking-finetuned-squad": 512,
+    "bert-base-cased-finetuned-mrpc": 512,
+    "bert-base-german-dbmdz-cased": 512,
+    "bert-base-german-dbmdz-uncased": 512,
+    "bert-base-finnish-cased-v1": 512,
+    "bert-base-finnish-uncased-v1": 512,
 }
 
 PRETRAINED_INIT_CONFIGURATION = {
-    'bert-base-uncased': {'do_lower_case': True},
-    'bert-large-uncased': {'do_lower_case': True},
-    'bert-base-cased': {'do_lower_case': False},
-    'bert-large-cased': {'do_lower_case': False},
-    'bert-base-multilingual-uncased': {'do_lower_case': True},
-    'bert-base-multilingual-cased': {'do_lower_case': False},
-    'bert-base-chinese': {'do_lower_case': False},
-    'bert-base-german-cased': {'do_lower_case': False},
-    'bert-large-uncased-whole-word-masking': {'do_lower_case': True},
-    'bert-large-cased-whole-word-masking': {'do_lower_case': False},
-    'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True},
-    'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False},
-    'bert-base-cased-finetuned-mrpc': {'do_lower_case': False},
-    'bert-base-german-dbmdz-cased': {'do_lower_case': False},
-    'bert-base-german-dbmdz-uncased': {'do_lower_case': True},
-    'bert-base-finnish-cased-v1': {'do_lower_case': False},
-    'bert-base-finnish-uncased-v1': {'do_lower_case': True},
+    "bert-base-uncased": {"do_lower_case": True},
+    "bert-large-uncased": {"do_lower_case": True},
+    "bert-base-cased": {"do_lower_case": False},
+    "bert-large-cased": {"do_lower_case": False},
+    "bert-base-multilingual-uncased": {"do_lower_case": True},
+    "bert-base-multilingual-cased": {"do_lower_case": False},
+    "bert-base-chinese": {"do_lower_case": False},
+    "bert-base-german-cased": {"do_lower_case": False},
+    "bert-large-uncased-whole-word-masking": {"do_lower_case": True},
+    "bert-large-cased-whole-word-masking": {"do_lower_case": False},
+    "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True},
+    "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False},
+    "bert-base-cased-finetuned-mrpc": {"do_lower_case": False},
+    "bert-base-german-dbmdz-cased": {"do_lower_case": False},
+    "bert-base-german-dbmdz-uncased": {"do_lower_case": True},
+    "bert-base-finnish-cased-v1": {"do_lower_case": False},
+    "bert-base-finnish-uncased-v1": {"do_lower_case": True},
 }
 
 
@@ -98,7 +97,7 @@ def load_vocab(vocab_file):
     with open(vocab_file, "r", encoding="utf-8") as reader:
         tokens = reader.readlines()
     for index, token in enumerate(tokens):
-        token = token.rstrip('\n')
+        token = token.rstrip("\n")
         vocab[token] = index
     return vocab
 
@@ -132,9 +131,20 @@ class BertTokenizer(PreTrainedTokenizer):
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
-                 unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
-                 mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs):
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=True,
+        do_basic_tokenize=True,
+        never_split=None,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        **kwargs
+    ):
         """Constructs a BertTokenizer.
 
         Args:
@@ -152,24 +162,29 @@ class BertTokenizer(PreTrainedTokenizer):
                 This should likely be deactivated for Japanese:
                 see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
         """
-        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
-                                            pad_token=pad_token, cls_token=cls_token,
-                                            mask_token=mask_token, **kwargs)
+        super(BertTokenizer, self).__init__(
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            **kwargs
+        )
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
 
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
-                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
+            )
         self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict(
-            [(ids, tok) for tok, ids in self.vocab.items()])
+        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
         self.do_basic_tokenize = do_basic_tokenize
         if do_basic_tokenize:
-            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
-                                                  never_split=never_split,
-                                                  tokenize_chinese_chars=tokenize_chinese_chars)
+            self.basic_tokenizer = BasicTokenizer(
+                do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars
+            )
         self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
 
     @property
@@ -196,7 +211,7 @@ class BertTokenizer(PreTrainedTokenizer):
 
     def convert_tokens_to_string(self, tokens):
         """ Converts a sequence of tokens (string) in a single string. """
-        out_string = ' '.join(tokens).replace(' ##', '').strip()
+        out_string = " ".join(tokens).replace(" ##", "").strip()
         return out_string
 
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
@@ -231,8 +246,10 @@ class BertTokenizer(PreTrainedTokenizer):
 
         if already_has_special_tokens:
             if token_ids_1 is not None:
-                raise ValueError("You should not supply a second sequence if the provided sequence of "
-                                 "ids is already formated with special tokens for the model.")
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formated with special tokens for the model."
+                )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
         if token_ids_1 is not None:
@@ -258,16 +275,18 @@ class BertTokenizer(PreTrainedTokenizer):
         """Save the tokenizer vocabulary to a directory or file."""
         index = 0
         if os.path.isdir(vocab_path):
-            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
+            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
         else:
             vocab_file = vocab_path
         with open(vocab_file, "w", encoding="utf-8") as writer:
             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                 if index != token_index:
-                    logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
-                                   " Please check that the vocabulary is not corrupted!".format(vocab_file))
+                    logger.warning(
+                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!".format(vocab_file)
+                    )
                     index = token_index
-                writer.write(token + u'\n')
+                writer.write(token + "\n")
                 index += 1
         return (vocab_file,)
 
@@ -382,14 +401,16 @@ class BasicTokenizer(object):
         # as is Japanese Hiragana and Katakana. Those alphabets are used to write
         # space-separated words, so they are not treated specially and handled
         # like the all of the other languages.
-        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
-                (cp >= 0x3400 and cp <= 0x4DBF) or  #
-                (cp >= 0x20000 and cp <= 0x2A6DF) or  #
-                (cp >= 0x2A700 and cp <= 0x2B73F) or  #
-                (cp >= 0x2B740 and cp <= 0x2B81F) or  #
-                (cp >= 0x2B820 and cp <= 0x2CEAF) or
-                (cp >= 0xF900 and cp <= 0xFAFF) or  #
-                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)  #
+            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
+            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
+            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
+        ):  #
             return True
 
         return False
@@ -399,7 +420,7 @@ class BasicTokenizer(object):
         output = []
         for char in text:
             cp = ord(char)
-            if cp == 0 or cp == 0xfffd or _is_control(char):
+            if cp == 0 or cp == 0xFFFD or _is_control(char):
                 continue
             if _is_whitespace(char):
                 output.append(" ")
@@ -499,8 +520,7 @@ def _is_punctuation(char):
     # Characters such as "^", "$", and "`" are not in the Unicode
     # Punctuation class but we treat them as punctuation anyways, for
     # consistency.
-    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
-            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
         return True
     cat = unicodedata.category(char)
     if cat.startswith("P"):
diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py
index 0ff45cbfe7..48b9b04b4c 100644
--- a/transformers/tokenization_bert_japanese.py
+++ b/transformers/tokenization_bert_japanese.py
@@ -28,46 +28,45 @@ from .tokenization_utils import PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)
 
-VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
-    {
-        'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt",
-        'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt",
-        'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt",
-        'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt"
+    "vocab_file": {
+        "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt",
+        "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt",
+        "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt",
+        "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'bert-base-japanese': 512,
-    'bert-base-japanese-whole-word-masking': 512,
-    'bert-base-japanese-char': 512,
-    'bert-base-japanese-char-whole-word-masking': 512
+    "bert-base-japanese": 512,
+    "bert-base-japanese-whole-word-masking": 512,
+    "bert-base-japanese-char": 512,
+    "bert-base-japanese-char-whole-word-masking": 512,
 }
 
 PRETRAINED_INIT_CONFIGURATION = {
-    'bert-base-japanese': {
-        'do_lower_case': False,
-        'word_tokenizer_type': 'mecab',
-        'subword_tokenizer_type': 'wordpiece'
+    "bert-base-japanese": {
+        "do_lower_case": False,
+        "word_tokenizer_type": "mecab",
+        "subword_tokenizer_type": "wordpiece",
     },
-    'bert-base-japanese-whole-word-masking':{
-        'do_lower_case': False,
-        'word_tokenizer_type': 'mecab',
-        'subword_tokenizer_type': 'wordpiece'
+    "bert-base-japanese-whole-word-masking": {
+        "do_lower_case": False,
+        "word_tokenizer_type": "mecab",
+        "subword_tokenizer_type": "wordpiece",
     },
-    'bert-base-japanese-char': {
-        'do_lower_case': False,
-        'word_tokenizer_type': 'mecab',
-        'subword_tokenizer_type': 'character'
+    "bert-base-japanese-char": {
+        "do_lower_case": False,
+        "word_tokenizer_type": "mecab",
+        "subword_tokenizer_type": "character",
+    },
+    "bert-base-japanese-char-whole-word-masking": {
+        "do_lower_case": False,
+        "word_tokenizer_type": "mecab",
+        "subword_tokenizer_type": "character",
     },
-    'bert-base-japanese-char-whole-word-masking': {
-        'do_lower_case': False,
-        'word_tokenizer_type': 'mecab',
-        'subword_tokenizer_type': 'character'
-    }
 }
 
 
@@ -79,11 +78,22 @@ class BertJapaneseTokenizer(BertTokenizer):
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, do_lower_case=False,
-                 do_word_tokenize=True, do_subword_tokenize=True,
-                 word_tokenizer_type='basic', subword_tokenizer_type='wordpiece',
-                 never_split=None, unk_token='[UNK]', sep_token='[SEP]',
-                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=False,
+        do_word_tokenize=True,
+        do_subword_tokenize=True,
+        word_tokenizer_type="basic",
+        subword_tokenizer_type="wordpiece",
+        never_split=None,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        **kwargs
+    ):
         """Constructs a MecabBertTokenizer.
 
         Args:
@@ -100,56 +110,53 @@ class BertJapaneseTokenizer(BertTokenizer):
             **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
                 Type of subword tokenizer.
         """
-        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
-                                            pad_token=pad_token, cls_token=cls_token,
-                                            mask_token=mask_token, **kwargs)
+        super(BertTokenizer, self).__init__(
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            **kwargs
+        )
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
 
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
-                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
+            )
         self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict(
-            [(ids, tok) for tok, ids in self.vocab.items()])
+        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
 
         self.do_word_tokenize = do_word_tokenize
         if do_word_tokenize:
-            if word_tokenizer_type == 'basic':
-                self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
-                                                     never_split=never_split,
-                                                     tokenize_chinese_chars=False)
-            elif word_tokenizer_type == 'mecab':
-                self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case,
-                                                     never_split=never_split)
+            if word_tokenizer_type == "basic":
+                self.word_tokenizer = BasicTokenizer(
+                    do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False
+                )
+            elif word_tokenizer_type == "mecab":
+                self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case, never_split=never_split)
             else:
-                raise ValueError(
-                    "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))
+                raise ValueError("Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))
 
         self.do_subword_tokenize = do_subword_tokenize
         if do_subword_tokenize:
-            if subword_tokenizer_type == 'wordpiece':
-                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab,
-                                                            unk_token=self.unk_token)
-            elif subword_tokenizer_type == 'character':
-                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab,
-                                                            unk_token=self.unk_token)
+            if subword_tokenizer_type == "wordpiece":
+                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+            elif subword_tokenizer_type == "character":
+                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
             else:
-                raise ValueError(
-                    "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))
-
+                raise ValueError("Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))
 
     def _tokenize(self, text):
         if self.do_word_tokenize:
-            tokens = self.word_tokenizer.tokenize(text,
-                                                  never_split=self.all_special_tokens)
+            tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)
         else:
             tokens = [text]
 
         if self.do_subword_tokenize:
-            split_tokens = [sub_token for token in tokens
-                            for sub_token in self.subword_tokenizer.tokenize(token)]
+            split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)]
         else:
             split_tokens = tokens
 
@@ -177,27 +184,28 @@ class MecabTokenizer(object):
         self.normalize_text = normalize_text
 
         import MeCab
+
         self.mecab = MeCab.Tagger()
 
     def tokenize(self, text, never_split=None, **kwargs):
         """Tokenizes a piece of text."""
         if self.normalize_text:
-            text = unicodedata.normalize('NFKC', text)
+            text = unicodedata.normalize("NFKC", text)
 
         never_split = self.never_split + (never_split if never_split is not None else [])
         tokens = []
 
         if six.PY2:
-            mecab_output = self.mecab.parse(text.encode('utf-8')).decode('utf-8')
+            mecab_output = self.mecab.parse(text.encode("utf-8")).decode("utf-8")
         else:
             mecab_output = self.mecab.parse(text)
 
         cursor = 0
-        for line in mecab_output.split('\n'):
-            if line == 'EOS':
+        for line in mecab_output.split("\n"):
+            if line == "EOS":
                 break
 
-            token, _ = line.split('\t')
+            token, _ = line.split("\t")
             token_start = text.index(token, cursor)
             token_end = token_start + len(token)
             if self.do_lower_case and token not in never_split:
@@ -240,7 +248,7 @@ class CharacterTokenizer(object):
             A list of characters.
         """
         if self.normalize_text:
-            text = unicodedata.normalize('NFKC', text)
+            text = unicodedata.normalize("NFKC", text)
 
         output_tokens = []
         for i, char in enumerate(text):
diff --git a/transformers/tokenization_camembert.py b/transformers/tokenization_camembert.py
index 4c4615eb3d..c1e80e0e05 100644
--- a/transformers/tokenization_camembert.py
+++ b/transformers/tokenization_camembert.py
@@ -13,8 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 """ Tokenization classes for Camembert model."""
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 import os
@@ -26,19 +25,19 @@ from .tokenization_xlnet import SPIECE_UNDERLINE
 
 logger = logging.getLogger(__name__)
 
-VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
-    {
-    'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model",
+    "vocab_file": {
+        "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'camembert-base': None,
+    "camembert-base": None,
 }
 
+
 class CamembertTokenizer(PreTrainedTokenizer):
     """
         Adapted from RobertaTokenizer and XLNetTokenizer
@@ -46,17 +45,36 @@ class CamembertTokenizer(PreTrainedTokenizer):
 
             - requires `SentencePiece <https://github.com/google/sentencepiece>`_
     """
+
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
-                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
-                 additional_special_tokens=['<s>NOTUSED', '</s>NOTUSED'], **kwargs):
-        super(CamembertTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
-                                                 sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
-                                                 mask_token=mask_token, additional_special_tokens=additional_special_tokens,
-                                                 **kwargs)
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
+        **kwargs
+    ):
+        super(CamembertTokenizer, self).__init__(
+            max_len=512,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs
+        )
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
         self.sp_model = spm.SentencePieceProcessor()
@@ -64,9 +82,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
         self.vocab_file = vocab_file
         # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
         # sentencepiece vocabulary (this is the case for <s> and </s>
-        self.fairseq_tokens_to_ids = {'<s>NOTUSED': 0, '<pad>': 1, '</s>NOTUSED': 2, '<unk>': 3}
+        self.fairseq_tokens_to_ids = {"<s>NOTUSED": 0, "<pad>": 1, "</s>NOTUSED": 2, "<unk>": 3}
         self.fairseq_offset = len(self.fairseq_tokens_to_ids)
-        self.fairseq_tokens_to_ids['<mask>'] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
+        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
         self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
 
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
@@ -100,8 +118,10 @@ class CamembertTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
             if token_ids_1 is not None:
-                raise ValueError("You should not supply a second sequence if the provided sequence of "
-                                 "ids is already formated with special tokens for the model.")
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formated with special tokens for the model."
+                )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
         if token_ids_1 is None:
@@ -148,7 +168,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
         return out_string
 
     def save_vocabulary(self, save_directory):
@@ -158,7 +178,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
 
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
diff --git a/transformers/tokenization_ctrl.py b/transformers/tokenization_ctrl.py
index 219f17c404..2ce2bbf094 100644
--- a/transformers/tokenization_ctrl.py
+++ b/transformers/tokenization_ctrl.py
@@ -13,8 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tokenization classes for Salesforce CTRL."""
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import json
 import logging
@@ -27,23 +26,17 @@ from .tokenization_utils import PreTrainedTokenizer
 logger = logging.getLogger(__name__)
 
 VOCAB_FILES_NAMES = {
-    'vocab_file': 'vocab.json',
-    'merges_file': 'merges.txt',
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
-    {
-        'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json",
-    },
-    'merges_file':
-    {
-        'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt",
-    },
+    "vocab_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json",},
+    "merges_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt",},
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'ctrl': 256,
+    "ctrl": 256,
 }
 
 CONTROL_CODES = {
@@ -104,6 +97,7 @@ CONTROL_CODES = {
     "multilingual": 128406,
 }
 
+
 def get_pairs(word):
     """Return set of symbol pairs in a word.
 
@@ -118,11 +112,13 @@ def get_pairs(word):
     pairs = set(pairs)
     return pairs
 
+
 class CTRLTokenizer(PreTrainedTokenizer):
     """
     CTRL BPE tokenizer. Peculiarities:
         - Byte-Pair-Encoding
     """
+
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -130,14 +126,18 @@ class CTRLTokenizer(PreTrainedTokenizer):
 
     def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
         super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs)
-        self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
-        self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
+        self.max_len_single_sentence = (
+            self.max_len
+        )  # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = (
+            self.max_len
+        )  # no default special tokens - you can update this value if you add special tokens
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
             self.encoder = json.load(vocab_handle)
-        self.decoder = {v:k for k,v in self.encoder.items()}
-        with open(merges_file, encoding='utf-8') as merges_handle:
-            merges = merges_handle.read().split('\n')[1:-1]
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            merges = merges_handle.read().split("\n")[1:-1]
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
@@ -150,14 +150,14 @@ class CTRLTokenizer(PreTrainedTokenizer):
         if token in self.cache:
             return self.cache[token]
         word = tuple(token)
-        word = tuple(list(word[:-1]) + [word[-1]+'</w>'])
+        word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
         pairs = get_pairs(word)
 
         if not pairs:
             return token
 
         while True:
-            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
             if bigram not in self.bpe_ranks:
                 break
             first, second = bigram
@@ -172,8 +172,8 @@ class CTRLTokenizer(PreTrainedTokenizer):
                     new_word.extend(word[i:])
                     break
 
-                if word[i] == first and i < len(word)-1 and word[i+1] == second:
-                    new_word.append(first+second)
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
                     i += 2
                 else:
                     new_word.append(word[i])
@@ -184,7 +184,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
                 break
             else:
                 pairs = get_pairs(word)
-        word = '@@ '.join(word)
+        word = "@@ ".join(word)
         word = word[:-4]
         self.cache[token] = word
         return word
@@ -194,10 +194,10 @@ class CTRLTokenizer(PreTrainedTokenizer):
         """
         split_tokens = []
 
-        words = re.findall(r'\S+\n?', text)
+        words = re.findall(r"\S+\n?", text)
 
         for token in words:
-            split_tokens.extend([t for t in self.bpe(token).split(' ')])
+            split_tokens.extend([t for t in self.bpe(token).split(" ")])
         return split_tokens
 
     def _convert_token_to_id(self, token):
@@ -210,7 +210,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
 
     def convert_tokens_to_string(self, tokens):
         """ Converts a sequence of tokens (string) in a single string. """
-        out_string = ' '.join(tokens).replace('@@ ', '').strip()
+        out_string = " ".join(tokens).replace("@@ ", "").strip()
         return out_string
 
     def save_vocabulary(self, save_directory):
@@ -218,21 +218,23 @@ class CTRLTokenizer(PreTrainedTokenizer):
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
-        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
 
-        with open(vocab_file, 'w', encoding='utf-8') as f:
+        with open(vocab_file, "w", encoding="utf-8") as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
 
         index = 0
         with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write(u'#version: 0.2\n')
+            writer.write("#version: 0.2\n")
             for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                 if index != token_index:
-                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
+                    logger.warning(
+                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!".format(merge_file)
+                    )
                     index = token_index
-                writer.write(' '.join(bpe_tokens) + u'\n')
+                writer.write(" ".join(bpe_tokens) + "\n")
                 index += 1
 
         return vocab_file, merge_file
diff --git a/transformers/tokenization_distilbert.py b/transformers/tokenization_distilbert.py
index 2f245d71dc..7fed1e4058 100644
--- a/transformers/tokenization_distilbert.py
+++ b/transformers/tokenization_distilbert.py
@@ -26,23 +26,22 @@ from .tokenization_bert import BertTokenizer
 
 logger = logging.getLogger(__name__)
 
-VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
-    {
-        'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
-        'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
-        'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt",
-        'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
+    "vocab_file": {
+        "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
+        "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+        "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt",
+        "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'distilbert-base-uncased': 512,
-    'distilbert-base-uncased-distilled-squad': 512,
-    'distilbert-base-german-cased': 512,
-    'distilbert-base-multilingual-cased': 512,
+    "distilbert-base-uncased": 512,
+    "distilbert-base-uncased-distilled-squad": 512,
+    "distilbert-base-german-cased": 512,
+    "distilbert-base-multilingual-cased": 512,
 }
 
 
diff --git a/transformers/tokenization_gpt2.py b/transformers/tokenization_gpt2.py
index 68c6101860..b6a0e7b78b 100644
--- a/transformers/tokenization_gpt2.py
+++ b/transformers/tokenization_gpt2.py
@@ -13,8 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import sys
 import json
@@ -31,42 +30,42 @@ except ImportError:
     def lru_cache():
         return lambda func: func
 
+
 from .tokenization_utils import PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)
 
 VOCAB_FILES_NAMES = {
-    'vocab_file': 'vocab.json',
-    'merges_file': 'merges.txt',
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
-    {
-        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
-        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
-        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
-        'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json",
-        'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
+    "vocab_file": {
+        "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
+        "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
+        "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
+        "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json",
+        "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
     },
-    'merges_file':
-    {
-        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
-        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
-        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
-        'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt",
-        'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
+    "merges_file": {
+        "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
+        "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
+        "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
+        "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt",
+        "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
     },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'gpt2': 1024,
-    'gpt2-medium': 1024,
-    'gpt2-large': 1024,
-    'gpt2-xl': 1024,
-    'distilgpt2': 1024,
+    "gpt2": 1024,
+    "gpt2-medium": 1024,
+    "gpt2-large": 1024,
+    "gpt2-xl": 1024,
+    "distilgpt2": 1024,
 }
 
+
 @lru_cache()
 def bytes_to_unicode():
     """
@@ -80,17 +79,20 @@ def bytes_to_unicode():
     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
     """
     _chr = unichr if sys.version_info[0] == 2 else chr
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    bs = (
+        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+    )
     cs = bs[:]
     n = 0
-    for b in range(2**8):
+    for b in range(2 ** 8):
         if b not in bs:
             bs.append(b)
-            cs.append(2**8+n)
+            cs.append(2 ** 8 + n)
             n += 1
     cs = [_chr(n) for n in cs]
     return dict(zip(bs, cs))
 
+
 def get_pairs(word):
     """Return set of symbol pairs in a word.
 
@@ -103,6 +105,7 @@ def get_pairs(word):
         prev_char = char
     return pairs
 
+
 class GPT2Tokenizer(PreTrainedTokenizer):
     """
     GPT-2 BPE tokenizer. Peculiarities:
@@ -112,15 +115,28 @@ class GPT2Tokenizer(PreTrainedTokenizer):
           Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve
           the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"`
     """
+
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
-                 bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        unk_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        eos_token="<|endoftext|>",
+        **kwargs
+    ):
         super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
-        self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
-        self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
+        self.max_len_single_sentence = (
+            self.max_len
+        )  # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = (
+            self.max_len
+        )  # no default special tokens - you can update this value if you add special tokens
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
             self.encoder = json.load(vocab_handle)
@@ -128,8 +144,8 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         self.errors = errors  # how to handle errors in decoding
         self.byte_encoder = bytes_to_unicode()
         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        with open(merges_file, encoding='utf-8') as merges_handle:
-            bpe_merges = merges_handle.read().split('\n')[1:-1]
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            bpe_merges = merges_handle.read().split("\n")[1:-1]
         bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
         self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
         self.cache = {}
@@ -151,7 +167,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
             return token
 
         while True:
-            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
             if bigram not in self.bpe_ranks:
                 break
             first, second = bigram
@@ -166,8 +182,8 @@ class GPT2Tokenizer(PreTrainedTokenizer):
                     new_word.extend(word[i:])
                     break
 
-                if word[i] == first and i < len(word)-1 and word[i+1] == second:
-                    new_word.append(first+second)
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
                     i += 2
                 else:
                     new_word.append(word[i])
@@ -178,7 +194,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
                 break
             else:
                 pairs = get_pairs(word)
-        word = ' '.join(word)
+        word = " ".join(word)
         self.cache[token] = word
         return word
 
@@ -189,15 +205,19 @@ class GPT2Tokenizer(PreTrainedTokenizer):
                     Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
         """
         if add_prefix_space:
-            text = ' ' + text
+            text = " " + text
 
         bpe_tokens = []
         for token in re.findall(self.pat, text):
             if sys.version_info[0] == 2:
-                token = ''.join(self.byte_encoder[ord(b)] for b in token) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
+                token = "".join(
+                    self.byte_encoder[ord(b)] for b in token
+                )  # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
             else:
-                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
+                token = "".join(
+                    self.byte_encoder[b] for b in token.encode("utf-8")
+                )  # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
         return bpe_tokens
 
     def _convert_token_to_id(self, token):
@@ -210,8 +230,8 @@ class GPT2Tokenizer(PreTrainedTokenizer):
 
     def convert_tokens_to_string(self, tokens):
         """ Converts a sequence of tokens (string) in a single string. """
-        text = ''.join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
+        text = "".join(tokens)
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
         return text
 
     def save_vocabulary(self, save_directory):
@@ -219,21 +239,23 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
-        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
 
-        with open(vocab_file, 'w', encoding='utf-8') as f:
+        with open(vocab_file, "w", encoding="utf-8") as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
 
         index = 0
         with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write(u'#version: 0.2\n')
+            writer.write("#version: 0.2\n")
             for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                 if index != token_index:
-                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
+                    logger.warning(
+                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!".format(merge_file)
+                    )
                     index = token_index
-                writer.write(' '.join(bpe_tokens) + u'\n')
+                writer.write(" ".join(bpe_tokens) + "\n")
                 index += 1
 
         return vocab_file, merge_file
diff --git a/transformers/tokenization_openai.py b/transformers/tokenization_openai.py
index a4c64b7020..d8f7549eda 100644
--- a/transformers/tokenization_openai.py
+++ b/transformers/tokenization_openai.py
@@ -13,8 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import json
 import logging
@@ -28,25 +27,20 @@ from .tokenization_bert import BasicTokenizer
 logger = logging.getLogger(__name__)
 
 VOCAB_FILES_NAMES = {
-    'vocab_file': 'vocab.json',
-    'merges_file': 'merges.txt',
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
-    {
-        'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
-    },
-    'merges_file':
-    {
-        'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
-    },
+    "vocab_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",},
+    "merges_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",},
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'openai-gpt': 512,
+    "openai-gpt": 512,
 }
 
+
 def get_pairs(word):
     """
     Return set of symbol pairs in a word.
@@ -59,27 +53,30 @@ def get_pairs(word):
         prev_char = char
     return pairs
 
+
 def text_standardize(text):
     """
     fixes some issues the spacy tokenizer had on books corpus
     also does some whitespace standardization
     """
-    text = text.replace('—', '-')
-    text = text.replace('–', '-')
-    text = text.replace('―', '-')
-    text = text.replace('…', '...')
-    text = text.replace('´', "'")
-    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
-    text = re.sub(r'\s*\n\s*', ' \n ', text)
-    text = re.sub(r'[^\S\n]+', ' ', text)
+    text = text.replace("—", "-")
+    text = text.replace("–", "-")
+    text = text.replace("―", "-")
+    text = text.replace("…", "...")
+    text = text.replace("´", "'")
+    text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text)
+    text = re.sub(r"\s*\n\s*", " \n ", text)
+    text = re.sub(r"[^\S\n]+", " ", text)
     return text.strip()
 
+
 class OpenAIGPTTokenizer(PreTrainedTokenizer):
     """
     BPE tokenizer. Peculiarities:
         - lower case all inputs
         - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
     """
+
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -87,12 +84,17 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
     def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
         super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)
 
-        self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
-        self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
+        self.max_len_single_sentence = (
+            self.max_len
+        )  # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = (
+            self.max_len
+        )  # no default special tokens - you can update this value if you add special tokens
 
         try:
             import ftfy
             from spacy.lang.en import English
+
             _nlp = English()
             self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
             self.fix_text = ftfy.fix_text
@@ -103,9 +105,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
             self.encoder = json.load(vocab_handle)
-        self.decoder = {v:k for k,v in self.encoder.items()}
-        with open(merges_file, encoding='utf-8') as merges_handle:
-            merges = merges_handle.read().split('\n')[1:-1]
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            merges = merges_handle.read().split("\n")[1:-1]
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
@@ -115,16 +117,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         return len(self.encoder)
 
     def bpe(self, token):
-        word = tuple(token[:-1]) + (token[-1] + '</w>',)
+        word = tuple(token[:-1]) + (token[-1] + "</w>",)
         if token in self.cache:
             return self.cache[token]
         pairs = get_pairs(word)
 
         if not pairs:
-            return token+'</w>'
+            return token + "</w>"
 
         while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
             if bigram not in self.bpe_ranks:
                 break
             first, second = bigram
@@ -139,8 +141,8 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
                     new_word.extend(word[i:])
                     break
 
-                if word[i] == first and i < len(word)-1 and word[i+1] == second:
-                    new_word.append(first+second)
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
                     i += 2
                 else:
                     new_word.append(word[i])
@@ -151,9 +153,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
                 break
             else:
                 pairs = get_pairs(word)
-        word = ' '.join(word)
-        if word == '\n  </w>':
-            word = '\n</w>'
+        word = " ".join(word)
+        if word == "\n  </w>":
+            word = "\n</w>"
         self.cache[token] = word
         return word
 
@@ -164,12 +166,12 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
             # Using BERT's BasicTokenizer
             text = self.nlp.tokenize(text)
             for token in text:
-                split_tokens.extend([t for t in self.bpe(token).split(' ')])
+                split_tokens.extend([t for t in self.bpe(token).split(" ")])
         else:
             # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
             text = self.nlp(text_standardize(self.fix_text(text)))
             for token in text:
-                split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
+                split_tokens.extend([t for t in self.bpe(token.text.lower()).split(" ")])
         return split_tokens
 
     def _convert_token_to_id(self, token):
@@ -182,7 +184,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
 
     def convert_tokens_to_string(self, tokens):
         """ Converts a sequence of tokens (string) in a single string. """
-        out_string = ''.join(tokens).replace('</w>', ' ').strip()
+        out_string = "".join(tokens).replace("</w>", " ").strip()
         return out_string
 
     def save_vocabulary(self, save_directory):
@@ -190,21 +192,23 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
-        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
 
-        with open(vocab_file, 'w', encoding='utf-8') as f:
+        with open(vocab_file, "w", encoding="utf-8") as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
 
         index = 0
         with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write(u'#version: 0.2\n')
+            writer.write("#version: 0.2\n")
             for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                 if index != token_index:
-                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
+                    logger.warning(
+                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!".format(merge_file)
+                    )
                     index = token_index
-                writer.write(' '.join(bpe_tokens) + u'\n')
+                writer.write(" ".join(bpe_tokens) + "\n")
                 index += 1
 
         return vocab_file, merge_file
diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py
index b44e004997..eae8b638fe 100644
--- a/transformers/tokenization_roberta.py
+++ b/transformers/tokenization_roberta.py
@@ -13,8 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tokenization classes for RoBERTa."""
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import sys
 import json
@@ -33,41 +32,40 @@ except ImportError:
     def lru_cache():
         return lambda func: func
 
+
 logger = logging.getLogger(__name__)
 
 VOCAB_FILES_NAMES = {
-    'vocab_file': 'vocab.json',
-    'merges_file': 'merges.txt',
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
-    {
-        'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
-        'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
-        'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
-        'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json",
-        'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
-        'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
+    "vocab_file": {
+        "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
+        "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
+        "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
+        "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json",
+        "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
+        "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
     },
-    'merges_file':
-    {
-        'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
-        'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
-        'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
-        'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt",
-        'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
-        'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
+    "merges_file": {
+        "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
+        "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
+        "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
+        "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt",
+        "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
+        "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
     },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'roberta-base': 512,
-    'roberta-large': 512,
-    'roberta-large-mnli': 512,
-    'distilroberta-base': 512,
-    'roberta-base-openai-detector': 512,
-    'roberta-large-openai-detector': 512,
+    "roberta-base": 512,
+    "roberta-large": 512,
+    "roberta-large-mnli": 512,
+    "distilroberta-base": 512,
+    "roberta-base-openai-detector": 512,
+    "roberta-large-openai-detector": 512,
 }
 
 
@@ -80,16 +78,38 @@ class RobertaTokenizer(GPT2Tokenizer):
           Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
           the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
     """
+
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
-                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
-        super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors,
-                                               bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
-                                               sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
-                                               mask_token=mask_token, **kwargs)
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        **kwargs
+    ):
+        super(RobertaTokenizer, self).__init__(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            errors=errors,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            **kwargs
+        )
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
 
@@ -124,8 +144,10 @@ class RobertaTokenizer(GPT2Tokenizer):
         """
         if already_has_special_tokens:
             if token_ids_1 is not None:
-                raise ValueError("You should not supply a second sequence if the provided sequence of "
-                                 "ids is already formated with special tokens for the model.")
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formated with special tokens for the model."
+                )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
         if token_ids_1 is None:
diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py
index 9fd37b67c0..3b70d40857 100644
--- a/transformers/tokenization_t5.py
+++ b/transformers/tokenization_t5.py
@@ -26,26 +26,25 @@ from .tokenization_utils import PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)
 
-SPIECE_UNDERLINE = u'▁'
+SPIECE_UNDERLINE = "▁"
 
 ####################################################
 # Mapping from the keyword arguments names of Tokenizer `__init__`
 # to file names for serializing Tokenizer instances
 ####################################################
-VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
 
 ####################################################
 # Mapping from the keyword arguments names of Tokenizer `__init__`
 # to pretrained vocabulary URL for all the model shortcut names.
 ####################################################
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
-    {
-        't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+    "vocab_file": {
+        "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
     }
 }
 
@@ -53,13 +52,14 @@ PRETRAINED_VOCAB_FILES_MAP = {
 # Mapping from model shortcut names to max length of inputs
 ####################################################
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    't5-small': 512,
-    't5-base': 512,
-    't5-large': 512,
-    't5-3b': 512,
-    't5-11b': 512,
+    "t5-small": 512,
+    "t5-base": 512,
+    "t5-large": 512,
+    "t5-3b": 512,
+    "t5-11b": 512,
 }
 
+
 class T5Tokenizer(PreTrainedTokenizer):
     """
         SentencePiece based tokenizer. Peculiarities:
@@ -71,28 +71,43 @@ class T5Tokenizer(PreTrainedTokenizer):
                 (like in T5 preprocessing
                 see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
     """
+
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, eos_token="</s>", unk_token="<unk>",
-                 pad_token="<pad>", extra_ids=100, additional_special_tokens=None, **kwargs):
+    def __init__(
+        self,
+        vocab_file,
+        eos_token="</s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        extra_ids=100,
+        additional_special_tokens=None,
+        **kwargs
+    ):
         # Add extra_ids to the special token list
         if extra_ids > 0:
             if additional_special_tokens is None:
                 additional_special_tokens = []
-            additional_special_tokens.extend([u"<extra_id_{}>".format(i) for i in range(extra_ids)])
+            additional_special_tokens.extend(["<extra_id_{}>".format(i) for i in range(extra_ids)])
 
-        super(T5Tokenizer, self).__init__(eos_token=eos_token, unk_token=unk_token,
-                                          pad_token=pad_token, additional_special_tokens=additional_special_tokens,
-                                          **kwargs)
+        super(T5Tokenizer, self).__init__(
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs
+        )
 
         try:
             import sentencepiece as spm
         except ImportError:
-            logger.warning("You need to install SentencePiece to use T5Tokenizer:"
-                           "https://github.com/google/sentencepiece"
-                           "pip install sentencepiece")
+            logger.warning(
+                "You need to install SentencePiece to use T5Tokenizer:"
+                "https://github.com/google/sentencepiece"
+                "pip install sentencepiece"
+            )
 
         self.vocab_file = vocab_file
         self._extra_ids = extra_ids
@@ -114,8 +129,10 @@ class T5Tokenizer(PreTrainedTokenizer):
         try:
             import sentencepiece as spm
         except ImportError:
-            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
-                           "pip install sentencepiece")
+            logger.warning(
+                "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
+                "pip install sentencepiece"
+            )
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(self.vocab_file)
 
@@ -132,7 +149,7 @@ class T5Tokenizer(PreTrainedTokenizer):
             ret_pieces = []
             for piece in pieces:
                 if isinstance(piece, str):
-                    piece = piece.decode('utf-8')
+                    piece = piece.decode("utf-8")
                 ret_pieces.append(piece)
             pieces = ret_pieces
 
@@ -140,8 +157,8 @@ class T5Tokenizer(PreTrainedTokenizer):
 
     def _convert_token_to_id(self, token):
         """ Converts a token (str/unicode) in an id using the vocab. """
-        if token.startswith(u"<extra_id_"):
-            l = re.match(r'<extra_id_(\d+)>', token)
+        if token.startswith("<extra_id_"):
+            l = re.match(r"<extra_id_(\d+)>", token)
             num = int(l.group(1))
             return self.vocab_size - num - 1
         return self.sp_model.piece_to_id(token)
@@ -151,9 +168,9 @@ class T5Tokenizer(PreTrainedTokenizer):
         if index < self.sp_model.get_piece_size():
             token = self.sp_model.IdToPiece(index)
         else:
-            token = u"<extra_id_{}>".format(self.vocab_size - 1 - index)
+            token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
         if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode('utf-8')
+            token = token.decode("utf-8")
         return token
 
     def convert_tokens_to_string(self, tokens):
@@ -168,7 +185,7 @@ class T5Tokenizer(PreTrainedTokenizer):
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
 
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
diff --git a/transformers/tokenization_transfo_xl.py b/transformers/tokenization_transfo_xl.py
index 8d5a0ce9d4..b2f59625f9 100644
--- a/transformers/tokenization_transfo_xl.py
+++ b/transformers/tokenization_transfo_xl.py
@@ -16,8 +16,7 @@
 """ Tokenization classes for Transformer XL model.
     Adapted from https://github.com/kimiyoung/transformer-xl.
 """
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import glob
 import logging
@@ -44,42 +43,58 @@ except ImportError:
 
 logger = logging.getLogger(__name__)
 
-VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin', 'vocab_file': 'vocab.txt'}
+VOCAB_FILES_NAMES = {"pretrained_vocab_file": "vocab.bin", "vocab_file": "vocab.txt"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    'pretrained_vocab_file':
-    {
-        'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
+    "pretrained_vocab_file": {
+        "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'transfo-xl-wt103': None,
+    "transfo-xl-wt103": None,
 }
 
 PRETRAINED_CORPUS_ARCHIVE_MAP = {
-    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin",
+    "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin",
 }
-CORPUS_NAME = 'corpus.bin'
+CORPUS_NAME = "corpus.bin"
+
 
 class TransfoXLTokenizer(PreTrainedTokenizer):
     """
     Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
     """
+
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, special=None, min_freq=0, max_size=None, lower_case=False,
-                 delimiter=None, vocab_file=None, pretrained_vocab_file=None,
-                 never_split=None, unk_token="<unk>", eos_token="<eos>",
-                 additional_special_tokens=["<formula>"], **kwargs):
-        super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token,
-                                                 additional_special_tokens=additional_special_tokens,
-                                                 **kwargs)
+    def __init__(
+        self,
+        special=None,
+        min_freq=0,
+        max_size=None,
+        lower_case=False,
+        delimiter=None,
+        vocab_file=None,
+        pretrained_vocab_file=None,
+        never_split=None,
+        unk_token="<unk>",
+        eos_token="<eos>",
+        additional_special_tokens=["<formula>"],
+        **kwargs
+    ):
+        super(TransfoXLTokenizer, self).__init__(
+            unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs
+        )
 
-        self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
-        self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
+        self.max_len_single_sentence = (
+            self.max_len
+        )  # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = (
+            self.max_len
+        )  # no default special tokens - you can update this value if you add special tokens
 
         if never_split is None:
             never_split = self.all_special_tokens
@@ -106,14 +121,15 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
             self.build_vocab()
 
     def count_file(self, path, verbose=False, add_eos=False):
-        if verbose: logger.info('counting file {} ...'.format(path))
+        if verbose:
+            logger.info("counting file {} ...".format(path))
         assert os.path.exists(path)
 
         sents = []
-        with open(path, 'r', encoding='utf-8') as f:
+        with open(path, "r", encoding="utf-8") as f:
             for idx, line in enumerate(f):
                 if verbose and idx > 0 and idx % 500000 == 0:
-                    logger.info('    line {}'.format(idx))
+                    logger.info("    line {}".format(idx))
                 symbols = self.tokenize(line, add_eos=add_eos)
                 self.counter.update(symbols)
                 sents.append(symbols)
@@ -124,42 +140,42 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         """
             sents : a list of sentences, each a list of tokenized symbols
         """
-        if verbose: logger.info('counting {} sents ...'.format(len(sents)))
+        if verbose:
+            logger.info("counting {} sents ...".format(len(sents)))
         for idx, symbols in enumerate(sents):
             if verbose and idx > 0 and idx % 500000 == 0:
-                logger.info('    line {}'.format(idx))
+                logger.info("    line {}".format(idx))
             self.counter.update(symbols)
 
     def _build_from_file(self, vocab_file):
         self.idx2sym = []
         self.sym2idx = OrderedDict()
 
-        with open(vocab_file, 'r', encoding='utf-8') as f:
+        with open(vocab_file, "r", encoding="utf-8") as f:
             for line in f:
                 symb = line.strip().split()[0]
                 self.add_symbol(symb)
-        if '<UNK>' in self.sym2idx:
-            self.unk_idx = self.sym2idx['<UNK>']
-        elif '<unk>' in self.sym2idx:
-            self.unk_idx = self.sym2idx['<unk>']
+        if "<UNK>" in self.sym2idx:
+            self.unk_idx = self.sym2idx["<UNK>"]
+        elif "<unk>" in self.sym2idx:
+            self.unk_idx = self.sym2idx["<unk>"]
         else:
-            raise ValueError('No <unkown> token in vocabulary')
+            raise ValueError("No <unkown> token in vocabulary")
 
     def save_vocabulary(self, vocab_path):
         """Save the tokenizer vocabulary to a directory or file."""
         if os.path.isdir(vocab_path):
-            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['pretrained_vocab_file'])
+            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"])
         torch.save(self.__dict__, vocab_file)
         return (vocab_file,)
 
     def build_vocab(self):
         if self.vocab_file:
-            logger.info('building vocab from {}'.format(self.vocab_file))
+            logger.info("building vocab from {}".format(self.vocab_file))
             self._build_from_file(self.vocab_file)
-            logger.info('final vocab size {}'.format(len(self)))
+            logger.info("final vocab size {}".format(len(self)))
         else:
-            logger.info('building vocab with min_freq={}, max_size={}'.format(
-                self.min_freq, self.max_size))
+            logger.info("building vocab with min_freq={}, max_size={}".format(self.min_freq, self.max_size))
             self.idx2sym = []
             self.sym2idx = OrderedDict()
 
@@ -167,23 +183,22 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
                 self.add_special(sym)
 
             for sym, cnt in self.counter.most_common(self.max_size):
-                if cnt < self.min_freq: break
+                if cnt < self.min_freq:
+                    break
                 self.add_symbol(sym)
 
-            logger.info('final vocab size {} from {} unique tokens'.format(
-                len(self), len(self.counter)))
+            logger.info("final vocab size {} from {} unique tokens".format(len(self), len(self.counter)))
 
-    def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
-            add_double_eos=False):
-        if verbose: logger.info('encoding file {} ...'.format(path))
+    def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False):
+        if verbose:
+            logger.info("encoding file {} ...".format(path))
         assert os.path.exists(path)
         encoded = []
-        with open(path, 'r', encoding='utf-8') as f:
+        with open(path, "r", encoding="utf-8") as f:
             for idx, line in enumerate(f):
                 if verbose and idx > 0 and idx % 500000 == 0:
-                    logger.info('    line {}'.format(idx))
-                symbols = self.tokenize(line, add_eos=add_eos,
-                    add_double_eos=add_double_eos)
+                    logger.info("    line {}".format(idx))
+                symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos)
                 encoded.append(self.convert_to_tensor(symbols))
 
         if ordered:
@@ -192,11 +207,12 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         return encoded
 
     def encode_sents(self, sents, ordered=False, verbose=False):
-        if verbose: logger.info('encoding {} sents ...'.format(len(sents)))
+        if verbose:
+            logger.info("encoding {} sents ...".format(len(sents)))
         encoded = []
         for idx, symbols in enumerate(sents):
             if verbose and idx > 0 and idx % 500000 == 0:
-                logger.info('    line {}'.format(idx))
+                logger.info("    line {}".format(idx))
             encoded.append(self.convert_to_tensor(symbols))
 
         if ordered:
@@ -208,7 +224,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         if sym not in self.sym2idx:
             self.idx2sym.append(sym)
             self.sym2idx[sym] = len(self.idx2sym) - 1
-            setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])
+            setattr(self, "{}_idx".format(sym.strip("<>")), self.sym2idx[sym])
 
     def add_symbol(self, sym):
         if sym not in self.sym2idx:
@@ -217,7 +233,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
 
     def _convert_id_to_token(self, idx):
         """Converts an id in a token (BPE) using the vocab."""
-        assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx)
+        assert 0 <= idx < len(self), "Index {} out of vocabulary range".format(idx)
         return self.idx2sym[idx]
 
     def _convert_token_to_id(self, sym):
@@ -227,19 +243,19 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         else:
             # logger.info('encounter unk {}'.format(sym))
             # assert '<eos>' not in sym
-            if hasattr(self, 'unk_idx'):
+            if hasattr(self, "unk_idx"):
                 return self.sym2idx.get(sym, self.unk_idx)
             # Backward compatibility with pre-trained models
-            elif '<unk>' in self.sym2idx:
-                return self.sym2idx['<unk>']
-            elif '<UNK>' in self.sym2idx:
-                return self.sym2idx['<UNK>']
+            elif "<unk>" in self.sym2idx:
+                return self.sym2idx["<unk>"]
+            elif "<UNK>" in self.sym2idx:
+                return self.sym2idx["<UNK>"]
             else:
-                raise ValueError('Token not in vocabulary and no <unk> token in vocabulary for replacement')
+                raise ValueError("Token not in vocabulary and no <unk> token in vocabulary for replacement")
 
     def convert_tokens_to_string(self, tokens):
         """ Converts a sequence of tokens (string) in a single string. """
-        out_string = ' '.join(tokens).strip()
+        out_string = " ".join(tokens).strip()
         return out_string
 
     def convert_to_tensor(self, symbols):
@@ -256,21 +272,21 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
             line = line.lower()
 
         # empty delimiter '' will evaluate False
-        if self.delimiter == '':
+        if self.delimiter == "":
             symbols = line
         else:
             symbols = line.split(self.delimiter)
 
-        if add_double_eos: # lm1b
-            return ['<S>'] + symbols + ['<S>']
+        if add_double_eos:  # lm1b
+            return ["<S>"] + symbols + ["<S>"]
         elif add_eos:
-            return symbols + ['<eos>']
+            return symbols + ["<eos>"]
         else:
             return symbols
 
 
 class LMOrderedIterator(object):
-    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None):
+    def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
         """
             data -- LongTensor -- the LongTensor is strictly ordered
         """
@@ -293,14 +309,15 @@ class LMOrderedIterator(object):
         self.n_batch = (self.n_step + self.bptt - 1) // self.bptt
 
     def get_batch(self, i, bptt=None):
-        if bptt is None: bptt = self.bptt
+        if bptt is None:
+            bptt = self.bptt
         seq_len = min(bptt, self.data.size(0) - 1 - i)
 
         end_idx = i + seq_len
         beg_idx = max(0, i - self.ext_len)
 
         data = self.data[beg_idx:end_idx]
-        target = self.data[i+1:i+1+seq_len]
+        target = self.data[i + 1 : i + 1 + seq_len]
 
         data_out = data.transpose(0, 1).contiguous().to(self.device)
         target_out = target.transpose(0, 1).contiguous().to(self.device)
@@ -315,7 +332,7 @@ class LMOrderedIterator(object):
         max_len = self.bptt + max_deviation * std
         i = start
         while True:
-            bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
+            bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0
             bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
             data, target, seq_len = self.get_batch(i, bptt)
             i += seq_len
@@ -328,7 +345,7 @@ class LMOrderedIterator(object):
 
 
 class LMShuffledIterator(object):
-    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False):
+    def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
         """
             data -- list[LongTensor] -- there is no order among the LongTensors
         """
@@ -343,8 +360,7 @@ class LMShuffledIterator(object):
 
     def get_sent_stream(self):
         # index iterator
-        epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \
-            else np.array(range(len(self.data)))
+        epoch_indices = np.random.permutation(len(self.data)) if self.shuffle else np.array(range(len(self.data)))
 
         # sentence iterator
         for idx in epoch_indices:
@@ -376,10 +392,8 @@ class LMShuffledIterator(object):
                         # number of new tokens to fill in
                         n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
                         # first n_retain tokens are retained from last batch
-                        data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \
-                            streams[i][:n_new]
-                        target[n_filled:n_filled+n_new, i] = \
-                            streams[i][1:n_new+1]
+                        data[n_retain + n_filled : n_retain + n_filled + n_new, i] = streams[i][:n_new]
+                        target[n_filled : n_filled + n_new, i] = streams[i][1 : n_new + 1]
                         streams[i] = streams[i][n_new:]
                         n_filled += n_new
                 except StopIteration:
@@ -408,8 +422,7 @@ class LMShuffledIterator(object):
 
 
 class LMMultiFileIterator(LMShuffledIterator):
-    def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None,
-        shuffle=False):
+    def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
 
         self.paths = paths
         self.vocab = vocab
@@ -460,15 +473,16 @@ class TransfoXLCorpus(object):
                 "We assumed '{}' was a path or url but couldn't find files {} "
                 "at this path or url.".format(
                     pretrained_model_name_or_path,
-                    ', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()),
+                    ", ".join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()),
                     pretrained_model_name_or_path,
-                    corpus_file))
+                    corpus_file,
+                )
+            )
             return None
         if resolved_corpus_file == corpus_file:
             logger.info("loading corpus file {}".format(corpus_file))
         else:
-            logger.info("loading corpus file {} from cache at {}".format(
-                corpus_file, resolved_corpus_file))
+            logger.info("loading corpus file {} from cache at {}".format(corpus_file, resolved_corpus_file))
 
         # Instantiate tokenizer.
         corpus = cls(*inputs, **kwargs)
@@ -494,83 +508,78 @@ class TransfoXLCorpus(object):
     def build_corpus(self, path, dataset):
         self.dataset = dataset
 
-        if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']:
-            self.vocab.count_file(os.path.join(path, 'train.txt'))
-            self.vocab.count_file(os.path.join(path, 'valid.txt'))
-            self.vocab.count_file(os.path.join(path, 'test.txt'))
-        elif self.dataset == 'wt103':
-            self.vocab.count_file(os.path.join(path, 'train.txt'))
-        elif self.dataset == 'lm1b':
+        if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
+            self.vocab.count_file(os.path.join(path, "train.txt"))
+            self.vocab.count_file(os.path.join(path, "valid.txt"))
+            self.vocab.count_file(os.path.join(path, "test.txt"))
+        elif self.dataset == "wt103":
+            self.vocab.count_file(os.path.join(path, "train.txt"))
+        elif self.dataset == "lm1b":
             train_path_pattern = os.path.join(
-                path, '1-billion-word-language-modeling-benchmark-r13output',
-                'training-monolingual.tokenized.shuffled', 'news.en-*')
+                path,
+                "1-billion-word-language-modeling-benchmark-r13output",
+                "training-monolingual.tokenized.shuffled",
+                "news.en-*",
+            )
             train_paths = glob.glob(train_path_pattern)
             # the vocab will load from file when build_vocab() is called
 
         self.vocab.build_vocab()
 
-        if self.dataset in ['ptb', 'wt2', 'wt103']:
-            self.train = self.vocab.encode_file(
-                os.path.join(path, 'train.txt'), ordered=True)
-            self.valid = self.vocab.encode_file(
-                os.path.join(path, 'valid.txt'), ordered=True)
-            self.test = self.vocab.encode_file(
-                os.path.join(path, 'test.txt'), ordered=True)
-        elif self.dataset in ['enwik8', 'text8']:
-            self.train = self.vocab.encode_file(
-                os.path.join(path, 'train.txt'), ordered=True, add_eos=False)
-            self.valid = self.vocab.encode_file(
-                os.path.join(path, 'valid.txt'), ordered=True, add_eos=False)
-            self.test = self.vocab.encode_file(
-                os.path.join(path, 'test.txt'), ordered=True, add_eos=False)
-        elif self.dataset == 'lm1b':
+        if self.dataset in ["ptb", "wt2", "wt103"]:
+            self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
+            self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
+            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True)
+        elif self.dataset in ["enwik8", "text8"]:
+            self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False)
+            self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
+            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False)
+        elif self.dataset == "lm1b":
             self.train = train_paths
-            self.valid = self.vocab.encode_file(
-                os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True)
-            self.test = self.vocab.encode_file(
-                os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True)
+            self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=False, add_double_eos=True)
+            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=False, add_double_eos=True)
 
     def get_iterator(self, split, *args, **kwargs):
-        if split == 'train':
-            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
+        if split == "train":
+            if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
                 data_iter = LMOrderedIterator(self.train, *args, **kwargs)
-            elif self.dataset == 'lm1b':
-                kwargs['shuffle'] = True
+            elif self.dataset == "lm1b":
+                kwargs["shuffle"] = True
                 data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
-        elif split in ['valid', 'test']:
-            data = self.valid if split == 'valid' else self.test
-            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
+        elif split in ["valid", "test"]:
+            data = self.valid if split == "valid" else self.test
+            if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
                 data_iter = LMOrderedIterator(data, *args, **kwargs)
-            elif self.dataset == 'lm1b':
+            elif self.dataset == "lm1b":
                 data_iter = LMShuffledIterator(data, *args, **kwargs)
 
         return data_iter
 
 
 def get_lm_corpus(datadir, dataset):
-    fn = os.path.join(datadir, 'cache.pt')
-    fn_pickle = os.path.join(datadir, 'cache.pkl')
+    fn = os.path.join(datadir, "cache.pt")
+    fn_pickle = os.path.join(datadir, "cache.pkl")
     if os.path.exists(fn):
-        logger.info('Loading cached dataset...')
+        logger.info("Loading cached dataset...")
         corpus = torch.load(fn_pickle)
     elif os.path.exists(fn):
-        logger.info('Loading cached dataset from pickle...')
+        logger.info("Loading cached dataset from pickle...")
         with open(fn, "rb") as fp:
             corpus = pickle.load(fp)
     else:
-        logger.info('Producing dataset {}...'.format(dataset))
+        logger.info("Producing dataset {}...".format(dataset))
         kwargs = {}
-        if dataset in ['wt103', 'wt2']:
-            kwargs['special'] = ['<eos>']
-            kwargs['lower_case'] = False
-        elif dataset == 'ptb':
-            kwargs['special'] = ['<eos>']
-            kwargs['lower_case'] = True
-        elif dataset == 'lm1b':
-            kwargs['special'] = []
-            kwargs['lower_case'] = False
-            kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt')
-        elif dataset in ['enwik8', 'text8']:
+        if dataset in ["wt103", "wt2"]:
+            kwargs["special"] = ["<eos>"]
+            kwargs["lower_case"] = False
+        elif dataset == "ptb":
+            kwargs["special"] = ["<eos>"]
+            kwargs["lower_case"] = True
+        elif dataset == "lm1b":
+            kwargs["special"] = []
+            kwargs["lower_case"] = False
+            kwargs["vocab_file"] = os.path.join(datadir, "1b_word_vocab.txt")
+        elif dataset in ["enwik8", "text8"]:
             pass
 
         corpus = TransfoXLCorpus(datadir, dataset, **kwargs)
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 33a59643f5..f848785ee2 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -13,8 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 import os
@@ -34,9 +33,10 @@ if is_torch_available():
 
 logger = logging.getLogger(__name__)
 
-SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
-ADDED_TOKENS_FILE = 'added_tokens.json'
-TOKENIZER_CONFIG_FILE = 'tokenizer_config.json'
+SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
+ADDED_TOKENS_FILE = "added_tokens.json"
+TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
+
 
 class PreTrainedTokenizer(object):
     """ Base class for all tokenizers.
@@ -69,14 +69,22 @@ class PreTrainedTokenizer(object):
 
         - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids``
     """
+
     vocab_files_names = {}
     pretrained_vocab_files_map = {}
     pretrained_init_configuration = {}
     max_model_input_sizes = {}
 
-    SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token",
-                                 "pad_token", "cls_token", "mask_token",
-                                 "additional_special_tokens"]
+    SPECIAL_TOKENS_ATTRIBUTES = [
+        "bos_token",
+        "eos_token",
+        "unk_token",
+        "sep_token",
+        "pad_token",
+        "cls_token",
+        "mask_token",
+        "additional_special_tokens",
+    ]
 
     padding_side = "right"
 
@@ -227,8 +235,8 @@ class PreTrainedTokenizer(object):
         self.max_len = max_len if max_len is not None else int(1e12)
 
         # Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed.
-        self.padding_side = kwargs.pop('padding_side', self.padding_side)
-        
+        self.padding_side = kwargs.pop("padding_side", self.padding_side)
+
         # Added tokens
         self.added_tokens_encoder = {}
         self.unique_added_tokens_encoder = set()
@@ -240,13 +248,14 @@ class PreTrainedTokenizer(object):
 
         for key, value in kwargs.items():
             if key in self.SPECIAL_TOKENS_ATTRIBUTES:
-                if key == 'additional_special_tokens':
-                    assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value)
+                if key == "additional_special_tokens":
+                    assert isinstance(value, (list, tuple)) and all(
+                        isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value
+                    )
                 else:
                     assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
                 setattr(self, key, value)
 
-
     @classmethod
     def from_pretrained(cls, *inputs, **kwargs):
         r"""
@@ -302,13 +311,12 @@ class PreTrainedTokenizer(object):
         """
         return cls._from_pretrained(*inputs, **kwargs)
 
-
     @classmethod
     def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
-        cache_dir = kwargs.pop('cache_dir', None)
-        force_download = kwargs.pop('force_download', False)
-        resume_download = kwargs.pop('resume_download', False)
-        proxies = kwargs.pop('proxies', None)
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        resume_download = kwargs.pop("resume_download", False)
+        proxies = kwargs.pop("proxies", None)
 
         s3_models = list(cls.max_model_input_sizes.keys())
         vocab_files = {}
@@ -317,15 +325,19 @@ class PreTrainedTokenizer(object):
             # Get the vocabulary from AWS S3 bucket
             for file_id, map_list in cls.pretrained_vocab_files_map.items():
                 vocab_files[file_id] = map_list[pretrained_model_name_or_path]
-            if cls.pretrained_init_configuration and pretrained_model_name_or_path in cls.pretrained_init_configuration:
+            if (
+                cls.pretrained_init_configuration
+                and pretrained_model_name_or_path in cls.pretrained_init_configuration
+            ):
                 init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path]
         else:
             # Get the vocabulary from local files
             logger.info(
                 "Model name '{}' not found in model shortcut name list ({}). "
                 "Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
-                    pretrained_model_name_or_path, ', '.join(s3_models),
-                    pretrained_model_name_or_path))
+                    pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path
+                )
+            )
 
             # Look for the tokenizer main vocabulary files
             for file_id, file_name in cls.vocab_files_names.items():
@@ -340,14 +352,15 @@ class PreTrainedTokenizer(object):
                     full_file_name = pretrained_model_name_or_path
                 else:
                     full_file_name = hf_bucket_url(pretrained_model_name_or_path, postfix=file_name)
-                
+
                 vocab_files[file_id] = full_file_name
 
             # Look for the additional tokens files
-            additional_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
-                                      'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE,
-                                      'tokenizer_config_file': TOKENIZER_CONFIG_FILE,
-                                      }
+            additional_files_names = {
+                "added_tokens_file": ADDED_TOKENS_FILE,
+                "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
+                "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
+            }
 
             # If a path to a file was provided, get the parent directory
             saved_directory = pretrained_model_name_or_path
@@ -366,9 +379,12 @@ class PreTrainedTokenizer(object):
                     "Model name '{}' was not found in tokenizers model name list ({}). "
                     "We assumed '{}' was a path or url to a directory containing vocabulary files "
                     "named {} but couldn't find such vocabulary files at this path or url.".format(
-                        pretrained_model_name_or_path, ', '.join(s3_models),
                         pretrained_model_name_or_path,
-                        list(cls.vocab_files_names.values())))
+                        ", ".join(s3_models),
+                        pretrained_model_name_or_path,
+                        list(cls.vocab_files_names.values()),
+                    )
+                )
 
         # Get files from url, cache, or disk depending on the case
         try:
@@ -377,17 +393,27 @@ class PreTrainedTokenizer(object):
                 if file_path is None:
                     resolved_vocab_files[file_id] = None
                 else:
-                    resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download)
+                    resolved_vocab_files[file_id] = cached_path(
+                        file_path,
+                        cache_dir=cache_dir,
+                        force_download=force_download,
+                        proxies=proxies,
+                        resume_download=resume_download,
+                    )
         except EnvironmentError:
             if pretrained_model_name_or_path in s3_models:
                 msg = "Couldn't reach server at '{}' to download vocabulary files."
             else:
-                msg = "Model name '{}' was not found in tokenizers model name list ({}). " \
-                    "We assumed '{}' was a path or url to a directory containing vocabulary files " \
+                msg = (
+                    "Model name '{}' was not found in tokenizers model name list ({}). "
+                    "We assumed '{}' was a path or url to a directory containing vocabulary files "
                     "named {}, but couldn't find such vocabulary files at this path or url.".format(
-                        pretrained_model_name_or_path, ', '.join(s3_models),
                         pretrained_model_name_or_path,
-                        list(cls.vocab_files_names.values()))
+                        ", ".join(s3_models),
+                        pretrained_model_name_or_path,
+                        list(cls.vocab_files_names.values()),
+                    )
+                )
 
             raise EnvironmentError(msg)
 
@@ -395,16 +421,15 @@ class PreTrainedTokenizer(object):
             if file_path == resolved_vocab_files[file_id]:
                 logger.info("loading file {}".format(file_path))
             else:
-                logger.info("loading file {} from cache at {}".format(
-                    file_path, resolved_vocab_files[file_id]))
+                logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id]))
 
         # Prepare tokenizer initialization kwargs
         # Did we saved some inputs and kwargs to reload ?
-        tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None)
+        tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
         if tokenizer_config_file is not None:
             with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
                 init_kwargs = json.load(tokenizer_config_handle)
-            saved_init_inputs = init_kwargs.pop('init_inputs', ())
+            saved_init_inputs = init_kwargs.pop("init_inputs", ())
             if not init_inputs:
                 init_inputs = saved_init_inputs
         else:
@@ -419,11 +444,11 @@ class PreTrainedTokenizer(object):
             # wont index sequences longer than the number of positional embeddings
             max_len = cls.max_model_input_sizes[pretrained_model_name_or_path]
             if max_len is not None and isinstance(max_len, (int, float)):
-                init_kwargs['max_len'] = min(init_kwargs.get('max_len', int(1e12)), max_len)
+                init_kwargs["max_len"] = min(init_kwargs.get("max_len", int(1e12)), max_len)
 
         # Merge resolved_vocab_files arguments in init_kwargs.
-        added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None)
-        special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None)
+        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
+        special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
         for args_name, file_path in resolved_vocab_files.items():
             if args_name not in init_kwargs:
                 init_kwargs[args_name] = file_path
@@ -438,8 +463,10 @@ class PreTrainedTokenizer(object):
         try:
             tokenizer = cls(*init_inputs, **init_kwargs)
         except OSError:
-            OSError("Unable to load vocabulary from file. "
-                    "Please check that the provided vocabulary is accessible and not corrupted.")
+            OSError(
+                "Unable to load vocabulary from file. "
+                "Please check that the provided vocabulary is accessible and not corrupted."
+            )
 
         # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
         tokenizer.init_inputs = init_inputs
@@ -449,13 +476,12 @@ class PreTrainedTokenizer(object):
         if added_tokens_file is not None:
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)
-            added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
+            added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
             tokenizer.added_tokens_encoder.update(added_tok_encoder)
             tokenizer.added_tokens_decoder.update(added_tok_decoder)
 
         return tokenizer
 
-
     def save_pretrained(self, save_directory):
         """ Save the tokenizer vocabulary files together with:
                 - added tokens,
@@ -476,28 +502,27 @@ class PreTrainedTokenizer(object):
         tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
 
         tokenizer_config = copy.deepcopy(self.init_kwargs)
-        tokenizer_config['init_inputs'] = copy.deepcopy(self.init_inputs)
+        tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
         for file_id in self.vocab_files_names.keys():
             tokenizer_config.pop(file_id, None)
 
-        with open(tokenizer_config_file, 'w', encoding='utf-8') as f:
+        with open(tokenizer_config_file, "w", encoding="utf-8") as f:
             f.write(json.dumps(tokenizer_config, ensure_ascii=False))
 
-        with open(special_tokens_map_file, 'w', encoding='utf-8') as f:
+        with open(special_tokens_map_file, "w", encoding="utf-8") as f:
             f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))
 
-        with open(added_tokens_file, 'w', encoding='utf-8') as f:
+        with open(added_tokens_file, "w", encoding="utf-8") as f:
             if self.added_tokens_encoder:
                 out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False)
             else:
-                out_str = u"{}"
+                out_str = "{}"
             f.write(out_str)
 
         vocab_files = self.save_vocabulary(save_directory)
 
         return vocab_files + (special_tokens_map_file, added_tokens_file)
 
-
     def save_vocabulary(self, save_directory):
         """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
             and special token mappings.
@@ -506,17 +531,14 @@ class PreTrainedTokenizer(object):
         """
         raise NotImplementedError
 
-
     def vocab_size(self):
         """ Size of the base vocabulary (without the added tokens) """
         raise NotImplementedError
 
-
     def __len__(self):
         """ Size of the full vocabulary with the added tokens """
         return self.vocab_size + len(self.added_tokens_encoder)
 
-
     def add_tokens(self, new_tokens):
         """
         Add a list of new tokens to the tokenizer class. If the new tokens are not in the
@@ -544,16 +566,18 @@ class PreTrainedTokenizer(object):
         to_add_tokens = []
         for token in new_tokens:
             assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
-            if self.init_kwargs.get('do_lower_case', False) and token not in self.all_special_tokens:
+            if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens:
                 token = token.lower()
-            if token != self.unk_token and \
-                    self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
-                    token not in to_add_tokens:
+            if (
+                token != self.unk_token
+                and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
+                and token not in to_add_tokens
+            ):
                 to_add_tokens.append(token)
                 logger.info("Adding %s to the vocabulary", token)
 
         added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens))
-        added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
+        added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
         self.added_tokens_encoder.update(added_tok_encoder)
         self.unique_added_tokens_encoder = set(self.added_tokens_encoder.keys()).union(set(self.all_special_tokens))
         self.added_tokens_decoder.update(added_tok_decoder)
@@ -622,8 +646,10 @@ class PreTrainedTokenizer(object):
         added_tokens = 0
         for key, value in special_tokens_dict.items():
             assert key in self.SPECIAL_TOKENS_ATTRIBUTES
-            if key == 'additional_special_tokens':
-                assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value)
+            if key == "additional_special_tokens":
+                assert isinstance(value, (list, tuple)) and all(
+                    isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value
+                )
                 added_tokens += self.add_tokens(value)
             else:
                 assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
@@ -633,7 +659,6 @@ class PreTrainedTokenizer(object):
 
         return added_tokens
 
-
     def tokenize(self, text, **kwargs):
         """ Converts a string in a sequence of tokens (string), using the tokenizer.
             Split in words for word-based vocabulary or sub-words for sub-word-based
@@ -649,14 +674,10 @@ class PreTrainedTokenizer(object):
         def lowercase_text(t):
             # convert non-special tokens to lowercase
             escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens]
-            pattern = r'(' + r'|'.join(escaped_special_toks) + r')|' + \
-                      r'(.+?)'
-            return re.sub(
-                pattern,
-                lambda m: m.groups()[0] or m.groups()[1].lower(),
-                t)
+            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
+            return re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), t)
 
-        if self.init_kwargs.get('do_lower_case', False):
+        if self.init_kwargs.get("do_lower_case", False):
             text = lowercase_text(text)
 
         def split_on_token(tok, text):
@@ -694,9 +715,14 @@ class PreTrainedTokenizer(object):
                         tokenized_text += [sub_text]
                 text_list = tokenized_text
 
-            return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) \
-                    if token not in self.unique_added_tokens_encoder
-                    else [token] for token in tokenized_text)))
+            return list(
+                itertools.chain.from_iterable(
+                    (
+                        self._tokenize(token, **kwargs) if token not in self.unique_added_tokens_encoder else [token]
+                        for token in tokenized_text
+                    )
+                )
+            )
 
         added_tokens = self.unique_added_tokens_encoder
         tokenized_text = split_on_tokens(added_tokens, text)
@@ -737,16 +763,18 @@ class PreTrainedTokenizer(object):
     def _convert_token_to_id(self, token):
         raise NotImplementedError
 
-    def encode(self,
-               text,
-               text_pair=None,
-               add_special_tokens=True,
-               max_length=None,
-               stride=0,
-               truncation_strategy='longest_first',
-               pad_to_max_length=False,
-               return_tensors=None,
-               **kwargs):
+    def encode(
+        self,
+        text,
+        text_pair=None,
+        add_special_tokens=True,
+        max_length=None,
+        stride=0,
+        truncation_strategy="longest_first",
+        pad_to_max_length=False,
+        return_tensors=None,
+        **kwargs
+    ):
         """
         Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
 
@@ -781,32 +809,36 @@ class PreTrainedTokenizer(object):
                 or PyTorch torch.Tensor instead of a list of python integers.
             **kwargs: passed to the `self.tokenize()` method
         """
-        encoded_inputs = self.encode_plus(text,
-                                          text_pair=text_pair,
-                                          max_length=max_length,
-                                          add_special_tokens=add_special_tokens,
-                                          stride=stride,
-                                          truncation_strategy=truncation_strategy,
-                                          pad_to_max_length=pad_to_max_length,
-                                          return_tensors=return_tensors,
-                                          **kwargs)
+        encoded_inputs = self.encode_plus(
+            text,
+            text_pair=text_pair,
+            max_length=max_length,
+            add_special_tokens=add_special_tokens,
+            stride=stride,
+            truncation_strategy=truncation_strategy,
+            pad_to_max_length=pad_to_max_length,
+            return_tensors=return_tensors,
+            **kwargs
+        )
 
         return encoded_inputs["input_ids"]
 
-    def encode_plus(self,
-                    text,
-                    text_pair=None,
-                    add_special_tokens=True,
-                    max_length=None,
-                    stride=0,
-                    truncation_strategy='longest_first',
-                    pad_to_max_length=False,
-                    return_tensors=None,
-                    return_token_type_ids=True,
-                    return_attention_mask=True,
-                    return_overflowing_tokens=False,
-                    return_special_tokens_mask=False,
-                    **kwargs):
+    def encode_plus(
+        self,
+        text,
+        text_pair=None,
+        add_special_tokens=True,
+        max_length=None,
+        stride=0,
+        truncation_strategy="longest_first",
+        pad_to_max_length=False,
+        return_tensors=None,
+        return_token_type_ids=True,
+        return_attention_mask=True,
+        return_overflowing_tokens=False,
+        return_special_tokens_mask=False,
+        **kwargs
+    ):
         """
         Returns a dictionary containing the encoded sequence or sequence pair and additional informations:
         the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
@@ -874,34 +906,40 @@ class PreTrainedTokenizer(object):
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                 return text
             else:
-                raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
+                raise ValueError(
+                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
+                )
 
         first_ids = get_input_ids(text)
         second_ids = get_input_ids(text_pair) if text_pair is not None else None
 
-        return self.prepare_for_model(first_ids,
-                                      pair_ids=second_ids,
-                                      max_length=max_length,
-                                      pad_to_max_length=pad_to_max_length,
-                                      add_special_tokens=add_special_tokens,
-                                      stride=stride,
-                                      truncation_strategy=truncation_strategy,
-                                      return_tensors=return_tensors,
-                                      return_attention_mask=return_attention_mask,
-                                      return_token_type_ids=return_token_type_ids,
-                                      return_overflowing_tokens=return_overflowing_tokens,
-                                      return_special_tokens_mask=return_special_tokens_mask)
+        return self.prepare_for_model(
+            first_ids,
+            pair_ids=second_ids,
+            max_length=max_length,
+            pad_to_max_length=pad_to_max_length,
+            add_special_tokens=add_special_tokens,
+            stride=stride,
+            truncation_strategy=truncation_strategy,
+            return_tensors=return_tensors,
+            return_attention_mask=return_attention_mask,
+            return_token_type_ids=return_token_type_ids,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+        )
 
-    def batch_encode_plus(self,
-                          batch_text_or_text_pairs=None,
-                          add_special_tokens=False,
-                          max_length=None,
-                          stride=0,
-                          truncation_strategy='longest_first',
-                          return_tensors=None,
-                          return_input_lengths=False,
-                          return_attention_masks=False,
-                          **kwargs):
+    def batch_encode_plus(
+        self,
+        batch_text_or_text_pairs=None,
+        add_special_tokens=False,
+        max_length=None,
+        stride=0,
+        truncation_strategy="longest_first",
+        return_tensors=None,
+        return_input_lengths=False,
+        return_attention_masks=False,
+        **kwargs
+    ):
         """
         Returns a dictionary containing the encoded sequence or sequence pair and additional information:
         the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
@@ -933,12 +971,19 @@ class PreTrainedTokenizer(object):
                 ids, pair_ids = ids_or_pair_ids
             else:
                 ids, pair_ids = ids_or_pair_ids, None
-            outputs = self.encode_plus(ids, pair_ids, add_special_tokens=add_special_tokens, max_length=max_length,
-                                       stride=stride, truncation_strategy=truncation_strategy, return_tensors=None)
+            outputs = self.encode_plus(
+                ids,
+                pair_ids,
+                add_special_tokens=add_special_tokens,
+                max_length=max_length,
+                stride=stride,
+                truncation_strategy=truncation_strategy,
+                return_tensors=None,
+            )
 
             # Append the non-padded length to the output
             if return_input_lengths:
-                outputs['input_len'] = len(outputs['input_ids'])
+                outputs["input_len"] = len(outputs["input_ids"])
 
             for key, value in outputs.items():
                 if key not in batch_outputs:
@@ -946,11 +991,11 @@ class PreTrainedTokenizer(object):
                 batch_outputs[key].append(value)
 
         # Compute longest sequence size
-        max_seq_len = max(map(len, batch_outputs['input_ids']))
+        max_seq_len = max(map(len, batch_outputs["input_ids"]))
 
         if return_attention_masks:
             # Allow the model to not give any special attention to padded input
-            batch_outputs['attention_mask'] = [[0] * len(v) for v in batch_outputs['input_ids']]
+            batch_outputs["attention_mask"] = [[0] * len(v) for v in batch_outputs["input_ids"]]
 
         if return_tensors is not None:
 
@@ -958,34 +1003,48 @@ class PreTrainedTokenizer(object):
             for key, value in batch_outputs.items():
 
                 padded_value = value
-                if key != 'input_len':
+                if key != "input_len":
                     # Padding handle
-                    padded_value = [v + [self.pad_token_id if key == 'input_ids' else 1] * (max_seq_len - len(v)) for v in padded_value]
+                    padded_value = [
+                        v + [self.pad_token_id if key == "input_ids" else 1] * (max_seq_len - len(v))
+                        for v in padded_value
+                    ]
 
-                if return_tensors == 'tf' and is_tf_available():
+                if return_tensors == "tf" and is_tf_available():
                     batch_outputs[key] = tf.constant(padded_value)
-                elif return_tensors == 'pt' and is_torch_available():
+                elif return_tensors == "pt" and is_torch_available():
                     batch_outputs[key] = torch.tensor(padded_value)
                 elif return_tensors is not None:
-                    logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
+                    logger.warning(
+                        "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
+                            return_tensors
+                        )
+                    )
 
         # encoder_attention_mask requires 1 for real token, 0 for padding, just invert value
         if return_attention_masks:
             if is_tf_available():
-                batch_outputs['attention_mask'] = tf.abs(batch_outputs['attention_mask'] - 1)
+                batch_outputs["attention_mask"] = tf.abs(batch_outputs["attention_mask"] - 1)
             else:
-                batch_outputs['attention_mask'] = torch.abs(batch_outputs['attention_mask'] - 1)
+                batch_outputs["attention_mask"] = torch.abs(batch_outputs["attention_mask"] - 1)
 
         return batch_outputs
 
-    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
-                          truncation_strategy='longest_first',
-                          pad_to_max_length=False,
-                          return_tensors=None,
-                          return_token_type_ids=True,
-                          return_attention_mask=True,
-                          return_overflowing_tokens=False,
-                          return_special_tokens_mask=False):
+    def prepare_for_model(
+        self,
+        ids,
+        pair_ids=None,
+        max_length=None,
+        add_special_tokens=True,
+        stride=0,
+        truncation_strategy="longest_first",
+        pad_to_max_length=False,
+        return_tensors=None,
+        return_token_type_ids=True,
+        return_attention_mask=True,
+        return_overflowing_tokens=False,
+        return_special_tokens_mask=False,
+    ):
         """
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
         It adds special tokens, truncates
@@ -1050,10 +1109,13 @@ class PreTrainedTokenizer(object):
         # Handle max sequence length
         total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0)
         if max_length and total_len > max_length:
-            ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids,
-                                                                        num_tokens_to_remove=total_len-max_length,
-                                                                        truncation_strategy=truncation_strategy,
-                                                                        stride=stride)
+            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
+                ids,
+                pair_ids=pair_ids,
+                num_tokens_to_remove=total_len - max_length,
+                truncation_strategy=truncation_strategy,
+                stride=stride,
+            )
             if return_overflowing_tokens:
                 encoded_inputs["overflowing_tokens"] = overflowing_tokens
                 encoded_inputs["num_truncated_tokens"] = total_len - max_length
@@ -1081,54 +1143,64 @@ class PreTrainedTokenizer(object):
                 encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
 
         if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len:
-            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
-                           "for this model ({} > {}). Running this sequence through the model will result in "
-                           "indexing errors".format(len(ids), self.max_len))
-                           
+            logger.warning(
+                "Token indices sequence length is longer than the specified maximum sequence length "
+                "for this model ({} > {}). Running this sequence through the model will result in "
+                "indexing errors".format(len(ids), self.max_len)
+            )
+
         needs_to_be_padded = pad_to_max_length and (
-            max_length and len(encoded_inputs["input_ids"]) < max_length
-            or 
-            max_length is None and len(encoded_inputs["input_ids"]) < self.max_len and self.max_len <= 10000
+            max_length
+            and len(encoded_inputs["input_ids"]) < max_length
+            or max_length is None
+            and len(encoded_inputs["input_ids"]) < self.max_len
+            and self.max_len <= 10000
         )
 
         if pad_to_max_length and max_length is None and self.max_len > 10000:
-            logger.warning("Sequence can't be padded as no maximum length is specified and the model maximum length is too high.")
+            logger.warning(
+                "Sequence can't be padded as no maximum length is specified and the model maximum length is too high."
+            )
 
         if needs_to_be_padded:
             difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"])
 
-            if self.padding_side == 'right':
+            if self.padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference
                 if return_token_type_ids:
-                    encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
+                    encoded_inputs["token_type_ids"] = (
+                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
+                    )
                 if return_special_tokens_mask:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
-            elif self.padding_side == 'left':
+            elif self.padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])
                 if return_token_type_ids:
-                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"]
+                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                        "token_type_ids"
+                    ]
                 if return_special_tokens_mask:
                     encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                 encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
 
             else:
                 raise ValueError("Invalid padding strategy:" + str(self.padding_side))
-            
+
         elif return_attention_mask:
             encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
 
         # Prepare inputs as tensors if asked
-        if return_tensors == 'tf' and is_tf_available():
+        if return_tensors == "tf" and is_tf_available():
             encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]])
             encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]])
 
             if "attention_mask" in encoded_inputs:
                 encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]])
 
-        elif return_tensors == 'pt' and is_torch_available():
+        elif return_tensors == "pt" and is_torch_available():
             encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])
             encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])
 
@@ -1137,11 +1209,15 @@ class PreTrainedTokenizer(object):
         elif return_tensors is not None:
             logger.warning(
                 "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
-                    return_tensors))
+                    return_tensors
+                )
+            )
 
         return encoded_inputs
 
-    def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
+    def truncate_sequences(
+        self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy="longest_first", stride=0
+    ):
         """Truncates a sequence pair in place to the maximum length.
             truncation_strategy: string selected in the following options:
                 - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
@@ -1154,7 +1230,7 @@ class PreTrainedTokenizer(object):
         if num_tokens_to_remove <= 0:
             return ids, pair_ids, []
 
-        if truncation_strategy == 'longest_first':
+        if truncation_strategy == "longest_first":
             overflowing_tokens = []
             for _ in range(num_tokens_to_remove):
                 if pair_ids is None or len(ids) > len(pair_ids):
@@ -1165,20 +1241,22 @@ class PreTrainedTokenizer(object):
             window_len = min(len(ids), stride)
             if window_len > 0:
                 overflowing_tokens = ids[-window_len:] + overflowing_tokens
-        elif truncation_strategy == 'only_first':
+        elif truncation_strategy == "only_first":
             assert len(ids) > num_tokens_to_remove
             window_len = min(len(ids), stride + num_tokens_to_remove)
             overflowing_tokens = ids[-window_len:]
             ids = ids[:-num_tokens_to_remove]
-        elif truncation_strategy == 'only_second':
+        elif truncation_strategy == "only_second":
             assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove
             window_len = min(len(pair_ids), stride + num_tokens_to_remove)
             overflowing_tokens = pair_ids[-window_len:]
             pair_ids = pair_ids[:-num_tokens_to_remove]
-        elif truncation_strategy == 'do_not_truncate':
+        elif truncation_strategy == "do_not_truncate":
             raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.")
         else:
-            raise ValueError("Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']")
+            raise ValueError(
+                "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']"
+            )
         return (ids, pair_ids, overflowing_tokens)
 
     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
@@ -1246,7 +1324,7 @@ class PreTrainedTokenizer(object):
             The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids))
             but we often want to remove sub-word tokenization artifacts at the same time.
         """
-        return ' '.join(self.convert_ids_to_tokens(tokens))
+        return " ".join(self.convert_ids_to_tokens(tokens))
 
     def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
         """
@@ -1278,7 +1356,7 @@ class PreTrainedTokenizer(object):
                 current_sub_text.append(token)
         if current_sub_text:
             sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-        text = ' '.join(sub_texts)
+        text = " ".join(sub_texts)
 
         if clean_up_tokenization_spaces:
             clean_text = self.clean_up_tokenization(text)
@@ -1323,7 +1401,17 @@ class PreTrainedTokenizer(object):
     def clean_up_tokenization(out_string):
         """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.
         """
-        out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
-                        ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
-                        ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
+        out_string = (
+            out_string.replace(" .", ".")
+            .replace(" ?", "?")
+            .replace(" !", "!")
+            .replace(" ,", ",")
+            .replace(" ' ", "'")
+            .replace(" n't", "n't")
+            .replace(" 'm", "'m")
+            .replace(" do not", " don't")
+            .replace(" 's", "'s")
+            .replace(" 've", "'ve")
+            .replace(" 're", "'re")
+        )
         return out_string
diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py
index 8def80bec4..9b96b92f23 100644
--- a/transformers/tokenization_xlm.py
+++ b/transformers/tokenization_xlm.py
@@ -13,8 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tokenization classes for XLM."""
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import json
 import logging
@@ -32,386 +31,402 @@ from .tokenization_bert import BasicTokenizer
 logger = logging.getLogger(__name__)
 
 VOCAB_FILES_NAMES = {
-    'vocab_file': 'vocab.json',
-    'merges_file': 'merges.txt',
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
-    {
-        'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
-        'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json",
-        'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json",
-        'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json",
-        'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json",
-        'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json",
-        'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json",
-        'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json",
-        'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json",
-        'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json",
+    "vocab_file": {
+        "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
+        "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json",
+        "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json",
+        "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json",
+        "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json",
+        "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json",
+        "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json",
+        "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json",
+        "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json",
+        "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json",
     },
-    'merges_file':
-    {
-        'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
-        'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
-        'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
-        'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt",
-        'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt",
-        'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt",
-        'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
-        'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
-        'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt",
-        'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt",
+    "merges_file": {
+        "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
+        "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
+        "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
+        "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt",
+        "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt",
+        "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt",
+        "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
+        "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
+        "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt",
+        "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt",
     },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'xlm-mlm-en-2048': 512,
-    'xlm-mlm-ende-1024': 512,
-    'xlm-mlm-enfr-1024': 512,
-    'xlm-mlm-enro-1024': 512,
-    'xlm-mlm-tlm-xnli15-1024': 512,
-    'xlm-mlm-xnli15-1024': 512,
-    'xlm-clm-enfr-1024': 512,
-    'xlm-clm-ende-1024': 512,
-    'xlm-mlm-17-1280': 512,
-    'xlm-mlm-100-1280': 512,
+    "xlm-mlm-en-2048": 512,
+    "xlm-mlm-ende-1024": 512,
+    "xlm-mlm-enfr-1024": 512,
+    "xlm-mlm-enro-1024": 512,
+    "xlm-mlm-tlm-xnli15-1024": 512,
+    "xlm-mlm-xnli15-1024": 512,
+    "xlm-clm-enfr-1024": 512,
+    "xlm-clm-ende-1024": 512,
+    "xlm-mlm-17-1280": 512,
+    "xlm-mlm-100-1280": 512,
 }
 
 PRETRAINED_INIT_CONFIGURATION = {
-    'xlm-mlm-en-2048': {"do_lowercase_and_remove_accent": True},
-    'xlm-mlm-ende-1024': { "do_lowercase_and_remove_accent": True,
-                            "id2lang": { "0": "de",
-                                        "1": "en"},
-                           "lang2id": { "de": 0,
-                                        "en": 1 }},
-    'xlm-mlm-enfr-1024': { "do_lowercase_and_remove_accent": True,
-                           "id2lang": { "0": "en",
-                                        "1": "fr"},
-                           "lang2id": { "en": 0,
-                                        "fr": 1 }},
-    'xlm-mlm-enro-1024': { "do_lowercase_and_remove_accent": True,
-                           "id2lang": { "0": "en",
-                                        "1": "ro"},
-                           "lang2id": { "en": 0,
-                                        "ro": 1 }},
-    'xlm-mlm-tlm-xnli15-1024': { "do_lowercase_and_remove_accent": True,
-                                 "id2lang": {   "0": "ar",
-                                                "1": "bg",
-                                                "2": "de",
-                                                "3": "el",
-                                                "4": "en",
-                                                "5": "es",
-                                                "6": "fr",
-                                                "7": "hi",
-                                                "8": "ru",
-                                                "9": "sw",
-                                                "10": "th",
-                                                "11": "tr",
-                                                "12": "ur",
-                                                "13": "vi",
-                                                "14": "zh"},
-                                 "lang2id": {   "ar": 0,
-                                                "bg": 1,
-                                                "de": 2,
-                                                "el": 3,
-                                                "en": 4,
-                                                "es": 5,
-                                                "fr": 6,
-                                                "hi": 7,
-                                                "ru": 8,
-                                                "sw": 9,
-                                                "th": 10,
-                                                "tr": 11,
-                                                "ur": 12,
-                                                "vi": 13,
-                                                "zh": 14 }},
-    'xlm-mlm-xnli15-1024': { "do_lowercase_and_remove_accent": True,
-                             "id2lang": {   "0": "ar",
-                                                "1": "bg",
-                                                "2": "de",
-                                                "3": "el",
-                                                "4": "en",
-                                                "5": "es",
-                                                "6": "fr",
-                                                "7": "hi",
-                                                "8": "ru",
-                                                "9": "sw",
-                                                "10": "th",
-                                                "11": "tr",
-                                                "12": "ur",
-                                                "13": "vi",
-                                                "14": "zh"},
-                                 "lang2id": {   "ar": 0,
-                                                "bg": 1,
-                                                "de": 2,
-                                                "el": 3,
-                                                "en": 4,
-                                                "es": 5,
-                                                "fr": 6,
-                                                "hi": 7,
-                                                "ru": 8,
-                                                "sw": 9,
-                                                "th": 10,
-                                                "tr": 11,
-                                                "ur": 12,
-                                                "vi": 13,
-                                                "zh": 14 }},
-    'xlm-clm-enfr-1024': { "do_lowercase_and_remove_accent": True,
-                           "id2lang": { "0": "en",
-                                        "1": "fr"},
-                           "lang2id": { "en": 0,
-                                        "fr": 1 }},
-    'xlm-clm-ende-1024': { "do_lowercase_and_remove_accent": True,
-                           "id2lang": { "0": "de",
-                                        "1": "en"},
-                           "lang2id": { "de": 0,
-                                        "en": 1 }},
-    'xlm-mlm-17-1280': {"do_lowercase_and_remove_accent": False,
-                        "id2lang": {
-                            "0": "ar",
-                            "1": "de",
-                            "2": "en",
-                            "3": "es",
-                            "4": "fr",
-                            "5": "hi",
-                            "6": "it",
-                            "7": "ja",
-                            "8": "ko",
-                            "9": "nl",
-                            "10": "pl",
-                            "11": "pt",
-                            "12": "ru",
-                            "13": "sv",
-                            "14": "tr",
-                            "15": "vi",
-                            "16": "zh"
-                        },
-                        "lang2id": {
-                            "ar": 0,
-                            "de": 1,
-                            "en": 2,
-                            "es": 3,
-                            "fr": 4,
-                            "hi": 5,
-                            "it": 6,
-                            "ja": 7,
-                            "ko": 8,
-                            "nl": 9,
-                            "pl": 10,
-                            "pt": 11,
-                            "ru": 12,
-                            "sv": 13,
-                            "tr": 14,
-                            "vi": 15,
-                            "zh": 16}},
-    'xlm-mlm-100-1280': {"do_lowercase_and_remove_accent": False,
-                        "id2lang": {
-                            "0": "af",
-                            "1": "als",
-                            "2": "am",
-                            "3": "an",
-                            "4": "ang",
-                            "5": "ar",
-                            "6": "arz",
-                            "7": "ast",
-                            "8": "az",
-                            "9": "bar",
-                            "10": "be",
-                            "11": "bg",
-                            "12": "bn",
-                            "13": "br",
-                            "14": "bs",
-                            "15": "ca",
-                            "16": "ceb",
-                            "17": "ckb",
-                            "18": "cs",
-                            "19": "cy",
-                            "20": "da",
-                            "21": "de",
-                            "22": "el",
-                            "23": "en",
-                            "24": "eo",
-                            "25": "es",
-                            "26": "et",
-                            "27": "eu",
-                            "28": "fa",
-                            "29": "fi",
-                            "30": "fr",
-                            "31": "fy",
-                            "32": "ga",
-                            "33": "gan",
-                            "34": "gl",
-                            "35": "gu",
-                            "36": "he",
-                            "37": "hi",
-                            "38": "hr",
-                            "39": "hu",
-                            "40": "hy",
-                            "41": "ia",
-                            "42": "id",
-                            "43": "is",
-                            "44": "it",
-                            "45": "ja",
-                            "46": "jv",
-                            "47": "ka",
-                            "48": "kk",
-                            "49": "kn",
-                            "50": "ko",
-                            "51": "ku",
-                            "52": "la",
-                            "53": "lb",
-                            "54": "lt",
-                            "55": "lv",
-                            "56": "mk",
-                            "57": "ml",
-                            "58": "mn",
-                            "59": "mr",
-                            "60": "ms",
-                            "61": "my",
-                            "62": "nds",
-                            "63": "ne",
-                            "64": "nl",
-                            "65": "nn",
-                            "66": "no",
-                            "67": "oc",
-                            "68": "pl",
-                            "69": "pt",
-                            "70": "ro",
-                            "71": "ru",
-                            "72": "scn",
-                            "73": "sco",
-                            "74": "sh",
-                            "75": "si",
-                            "76": "simple",
-                            "77": "sk",
-                            "78": "sl",
-                            "79": "sq",
-                            "80": "sr",
-                            "81": "sv",
-                            "82": "sw",
-                            "83": "ta",
-                            "84": "te",
-                            "85": "th",
-                            "86": "tl",
-                            "87": "tr",
-                            "88": "tt",
-                            "89": "uk",
-                            "90": "ur",
-                            "91": "uz",
-                            "92": "vi",
-                            "93": "war",
-                            "94": "wuu",
-                            "95": "yi",
-                            "96": "zh",
-                            "97": "zh_classical",
-                            "98": "zh_min_nan",
-                            "99": "zh_yue"
-                        },
-                        "lang2id": {
-                            "af": 0,
-                            "als": 1,
-                            "am": 2,
-                            "an": 3,
-                            "ang": 4,
-                            "ar": 5,
-                            "arz": 6,
-                            "ast": 7,
-                            "az": 8,
-                            "bar": 9,
-                            "be": 10,
-                            "bg": 11,
-                            "bn": 12,
-                            "br": 13,
-                            "bs": 14,
-                            "ca": 15,
-                            "ceb": 16,
-                            "ckb": 17,
-                            "cs": 18,
-                            "cy": 19,
-                            "da": 20,
-                            "de": 21,
-                            "el": 22,
-                            "en": 23,
-                            "eo": 24,
-                            "es": 25,
-                            "et": 26,
-                            "eu": 27,
-                            "fa": 28,
-                            "fi": 29,
-                            "fr": 30,
-                            "fy": 31,
-                            "ga": 32,
-                            "gan": 33,
-                            "gl": 34,
-                            "gu": 35,
-                            "he": 36,
-                            "hi": 37,
-                            "hr": 38,
-                            "hu": 39,
-                            "hy": 40,
-                            "ia": 41,
-                            "id": 42,
-                            "is": 43,
-                            "it": 44,
-                            "ja": 45,
-                            "jv": 46,
-                            "ka": 47,
-                            "kk": 48,
-                            "kn": 49,
-                            "ko": 50,
-                            "ku": 51,
-                            "la": 52,
-                            "lb": 53,
-                            "lt": 54,
-                            "lv": 55,
-                            "mk": 56,
-                            "ml": 57,
-                            "mn": 58,
-                            "mr": 59,
-                            "ms": 60,
-                            "my": 61,
-                            "nds": 62,
-                            "ne": 63,
-                            "nl": 64,
-                            "nn": 65,
-                            "no": 66,
-                            "oc": 67,
-                            "pl": 68,
-                            "pt": 69,
-                            "ro": 70,
-                            "ru": 71,
-                            "scn": 72,
-                            "sco": 73,
-                            "sh": 74,
-                            "si": 75,
-                            "simple": 76,
-                            "sk": 77,
-                            "sl": 78,
-                            "sq": 79,
-                            "sr": 80,
-                            "sv": 81,
-                            "sw": 82,
-                            "ta": 83,
-                            "te": 84,
-                            "th": 85,
-                            "tl": 86,
-                            "tr": 87,
-                            "tt": 88,
-                            "uk": 89,
-                            "ur": 90,
-                            "uz": 91,
-                            "vi": 92,
-                            "war": 93,
-                            "wuu": 94,
-                            "yi": 95,
-                            "zh": 96,
-                            "zh_classical": 97,
-                            "zh_min_nan": 98,
-                            "zh_yue": 99
-                        }},
+    "xlm-mlm-en-2048": {"do_lowercase_and_remove_accent": True},
+    "xlm-mlm-ende-1024": {
+        "do_lowercase_and_remove_accent": True,
+        "id2lang": {"0": "de", "1": "en"},
+        "lang2id": {"de": 0, "en": 1},
+    },
+    "xlm-mlm-enfr-1024": {
+        "do_lowercase_and_remove_accent": True,
+        "id2lang": {"0": "en", "1": "fr"},
+        "lang2id": {"en": 0, "fr": 1},
+    },
+    "xlm-mlm-enro-1024": {
+        "do_lowercase_and_remove_accent": True,
+        "id2lang": {"0": "en", "1": "ro"},
+        "lang2id": {"en": 0, "ro": 1},
+    },
+    "xlm-mlm-tlm-xnli15-1024": {
+        "do_lowercase_and_remove_accent": True,
+        "id2lang": {
+            "0": "ar",
+            "1": "bg",
+            "2": "de",
+            "3": "el",
+            "4": "en",
+            "5": "es",
+            "6": "fr",
+            "7": "hi",
+            "8": "ru",
+            "9": "sw",
+            "10": "th",
+            "11": "tr",
+            "12": "ur",
+            "13": "vi",
+            "14": "zh",
+        },
+        "lang2id": {
+            "ar": 0,
+            "bg": 1,
+            "de": 2,
+            "el": 3,
+            "en": 4,
+            "es": 5,
+            "fr": 6,
+            "hi": 7,
+            "ru": 8,
+            "sw": 9,
+            "th": 10,
+            "tr": 11,
+            "ur": 12,
+            "vi": 13,
+            "zh": 14,
+        },
+    },
+    "xlm-mlm-xnli15-1024": {
+        "do_lowercase_and_remove_accent": True,
+        "id2lang": {
+            "0": "ar",
+            "1": "bg",
+            "2": "de",
+            "3": "el",
+            "4": "en",
+            "5": "es",
+            "6": "fr",
+            "7": "hi",
+            "8": "ru",
+            "9": "sw",
+            "10": "th",
+            "11": "tr",
+            "12": "ur",
+            "13": "vi",
+            "14": "zh",
+        },
+        "lang2id": {
+            "ar": 0,
+            "bg": 1,
+            "de": 2,
+            "el": 3,
+            "en": 4,
+            "es": 5,
+            "fr": 6,
+            "hi": 7,
+            "ru": 8,
+            "sw": 9,
+            "th": 10,
+            "tr": 11,
+            "ur": 12,
+            "vi": 13,
+            "zh": 14,
+        },
+    },
+    "xlm-clm-enfr-1024": {
+        "do_lowercase_and_remove_accent": True,
+        "id2lang": {"0": "en", "1": "fr"},
+        "lang2id": {"en": 0, "fr": 1},
+    },
+    "xlm-clm-ende-1024": {
+        "do_lowercase_and_remove_accent": True,
+        "id2lang": {"0": "de", "1": "en"},
+        "lang2id": {"de": 0, "en": 1},
+    },
+    "xlm-mlm-17-1280": {
+        "do_lowercase_and_remove_accent": False,
+        "id2lang": {
+            "0": "ar",
+            "1": "de",
+            "2": "en",
+            "3": "es",
+            "4": "fr",
+            "5": "hi",
+            "6": "it",
+            "7": "ja",
+            "8": "ko",
+            "9": "nl",
+            "10": "pl",
+            "11": "pt",
+            "12": "ru",
+            "13": "sv",
+            "14": "tr",
+            "15": "vi",
+            "16": "zh",
+        },
+        "lang2id": {
+            "ar": 0,
+            "de": 1,
+            "en": 2,
+            "es": 3,
+            "fr": 4,
+            "hi": 5,
+            "it": 6,
+            "ja": 7,
+            "ko": 8,
+            "nl": 9,
+            "pl": 10,
+            "pt": 11,
+            "ru": 12,
+            "sv": 13,
+            "tr": 14,
+            "vi": 15,
+            "zh": 16,
+        },
+    },
+    "xlm-mlm-100-1280": {
+        "do_lowercase_and_remove_accent": False,
+        "id2lang": {
+            "0": "af",
+            "1": "als",
+            "2": "am",
+            "3": "an",
+            "4": "ang",
+            "5": "ar",
+            "6": "arz",
+            "7": "ast",
+            "8": "az",
+            "9": "bar",
+            "10": "be",
+            "11": "bg",
+            "12": "bn",
+            "13": "br",
+            "14": "bs",
+            "15": "ca",
+            "16": "ceb",
+            "17": "ckb",
+            "18": "cs",
+            "19": "cy",
+            "20": "da",
+            "21": "de",
+            "22": "el",
+            "23": "en",
+            "24": "eo",
+            "25": "es",
+            "26": "et",
+            "27": "eu",
+            "28": "fa",
+            "29": "fi",
+            "30": "fr",
+            "31": "fy",
+            "32": "ga",
+            "33": "gan",
+            "34": "gl",
+            "35": "gu",
+            "36": "he",
+            "37": "hi",
+            "38": "hr",
+            "39": "hu",
+            "40": "hy",
+            "41": "ia",
+            "42": "id",
+            "43": "is",
+            "44": "it",
+            "45": "ja",
+            "46": "jv",
+            "47": "ka",
+            "48": "kk",
+            "49": "kn",
+            "50": "ko",
+            "51": "ku",
+            "52": "la",
+            "53": "lb",
+            "54": "lt",
+            "55": "lv",
+            "56": "mk",
+            "57": "ml",
+            "58": "mn",
+            "59": "mr",
+            "60": "ms",
+            "61": "my",
+            "62": "nds",
+            "63": "ne",
+            "64": "nl",
+            "65": "nn",
+            "66": "no",
+            "67": "oc",
+            "68": "pl",
+            "69": "pt",
+            "70": "ro",
+            "71": "ru",
+            "72": "scn",
+            "73": "sco",
+            "74": "sh",
+            "75": "si",
+            "76": "simple",
+            "77": "sk",
+            "78": "sl",
+            "79": "sq",
+            "80": "sr",
+            "81": "sv",
+            "82": "sw",
+            "83": "ta",
+            "84": "te",
+            "85": "th",
+            "86": "tl",
+            "87": "tr",
+            "88": "tt",
+            "89": "uk",
+            "90": "ur",
+            "91": "uz",
+            "92": "vi",
+            "93": "war",
+            "94": "wuu",
+            "95": "yi",
+            "96": "zh",
+            "97": "zh_classical",
+            "98": "zh_min_nan",
+            "99": "zh_yue",
+        },
+        "lang2id": {
+            "af": 0,
+            "als": 1,
+            "am": 2,
+            "an": 3,
+            "ang": 4,
+            "ar": 5,
+            "arz": 6,
+            "ast": 7,
+            "az": 8,
+            "bar": 9,
+            "be": 10,
+            "bg": 11,
+            "bn": 12,
+            "br": 13,
+            "bs": 14,
+            "ca": 15,
+            "ceb": 16,
+            "ckb": 17,
+            "cs": 18,
+            "cy": 19,
+            "da": 20,
+            "de": 21,
+            "el": 22,
+            "en": 23,
+            "eo": 24,
+            "es": 25,
+            "et": 26,
+            "eu": 27,
+            "fa": 28,
+            "fi": 29,
+            "fr": 30,
+            "fy": 31,
+            "ga": 32,
+            "gan": 33,
+            "gl": 34,
+            "gu": 35,
+            "he": 36,
+            "hi": 37,
+            "hr": 38,
+            "hu": 39,
+            "hy": 40,
+            "ia": 41,
+            "id": 42,
+            "is": 43,
+            "it": 44,
+            "ja": 45,
+            "jv": 46,
+            "ka": 47,
+            "kk": 48,
+            "kn": 49,
+            "ko": 50,
+            "ku": 51,
+            "la": 52,
+            "lb": 53,
+            "lt": 54,
+            "lv": 55,
+            "mk": 56,
+            "ml": 57,
+            "mn": 58,
+            "mr": 59,
+            "ms": 60,
+            "my": 61,
+            "nds": 62,
+            "ne": 63,
+            "nl": 64,
+            "nn": 65,
+            "no": 66,
+            "oc": 67,
+            "pl": 68,
+            "pt": 69,
+            "ro": 70,
+            "ru": 71,
+            "scn": 72,
+            "sco": 73,
+            "sh": 74,
+            "si": 75,
+            "simple": 76,
+            "sk": 77,
+            "sl": 78,
+            "sq": 79,
+            "sr": 80,
+            "sv": 81,
+            "sw": 82,
+            "ta": 83,
+            "te": 84,
+            "th": 85,
+            "tl": 86,
+            "tr": 87,
+            "tt": 88,
+            "uk": 89,
+            "ur": 90,
+            "uz": 91,
+            "vi": 92,
+            "war": 93,
+            "wuu": 94,
+            "yi": 95,
+            "zh": 96,
+            "zh_classical": 97,
+            "zh_min_nan": 98,
+            "zh_yue": 99,
+        },
+    },
 }
 
+
 def get_pairs(word):
     """
     Return set of symbol pairs in a word.
@@ -430,7 +445,7 @@ def lowercase_and_remove_accent(text):
     Lowercase and strips accents from a piece of text based on
     https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py
     """
-    text = ' '.join(text)
+    text = " ".join(text)
     text = text.lower()
     text = unicodedata.normalize("NFD", text)
     output = []
@@ -439,73 +454,73 @@ def lowercase_and_remove_accent(text):
         if cat == "Mn":
             continue
         output.append(char)
-    return "".join(output).lower().split(' ')
+    return "".join(output).lower().split(" ")
 
 
 def replace_unicode_punct(text):
-    '''
+    """
     Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
-    '''
-    text = text.replace('，', ',')
-    text = re.sub(r'。\s*', '. ', text)
-    text = text.replace('、', ',')
-    text = text.replace('”', '"')
-    text = text.replace('“', '"')
-    text = text.replace('∶', ':')
-    text = text.replace('：', ':')
-    text = text.replace('？', '?')
-    text = text.replace('《', '"')
-    text = text.replace('》', '"')
-    text = text.replace('）', ')')
-    text = text.replace('！', '!')
-    text = text.replace('（', '(')
-    text = text.replace('；', ';')
-    text = text.replace('１', '"')
-    text = text.replace('」', '"')
-    text = text.replace('「', '"')
-    text = text.replace('０', '0')
-    text = text.replace('３', '3')
-    text = text.replace('２', '2')
-    text = text.replace('５', '5')
-    text = text.replace('６', '6')
-    text = text.replace('９', '9')
-    text = text.replace('７', '7')
-    text = text.replace('８', '8')
-    text = text.replace('４', '4')
-    text = re.sub(r'．\s*', '. ', text)
-    text = text.replace('～', '~')
-    text = text.replace('’', '\'')
-    text = text.replace('…', '...')
-    text = text.replace('━', '-')
-    text = text.replace('〈', '<')
-    text = text.replace('〉', '>')
-    text = text.replace('【', '[')
-    text = text.replace('】', ']')
-    text = text.replace('％', '%')
+    """
+    text = text.replace("，", ",")
+    text = re.sub(r"。\s*", ". ", text)
+    text = text.replace("、", ",")
+    text = text.replace("”", '"')
+    text = text.replace("“", '"')
+    text = text.replace("∶", ":")
+    text = text.replace("：", ":")
+    text = text.replace("？", "?")
+    text = text.replace("《", '"')
+    text = text.replace("》", '"')
+    text = text.replace("）", ")")
+    text = text.replace("！", "!")
+    text = text.replace("（", "(")
+    text = text.replace("；", ";")
+    text = text.replace("１", '"')
+    text = text.replace("」", '"')
+    text = text.replace("「", '"')
+    text = text.replace("０", "0")
+    text = text.replace("３", "3")
+    text = text.replace("２", "2")
+    text = text.replace("５", "5")
+    text = text.replace("６", "6")
+    text = text.replace("９", "9")
+    text = text.replace("７", "7")
+    text = text.replace("８", "8")
+    text = text.replace("４", "4")
+    text = re.sub(r"．\s*", ". ", text)
+    text = text.replace("～", "~")
+    text = text.replace("’", "'")
+    text = text.replace("…", "...")
+    text = text.replace("━", "-")
+    text = text.replace("〈", "<")
+    text = text.replace("〉", ">")
+    text = text.replace("【", "[")
+    text = text.replace("】", "]")
+    text = text.replace("％", "%")
     return text
 
 
 def remove_non_printing_char(text):
-    '''
+    """
     Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
-    '''
+    """
     output = []
     for char in text:
         cat = unicodedata.category(char)
-        if cat.startswith('C'):
+        if cat.startswith("C"):
             continue
         output.append(char)
     return "".join(output)
 
 
 def romanian_preprocessing(text):
-    '''Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`'''
+    """Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`"""
     # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py
     text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219")
     text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b")
     # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/remove-diacritics.py
-    text = text.replace("\u0218", "S").replace("\u0219", "s") #s-comma
-    text = text.replace("\u021a", "T").replace("\u021b", "t") #t-comma
+    text = text.replace("\u0218", "S").replace("\u0219", "s")  # s-comma
+    text = text.replace("\u021a", "T").replace("\u021b", "t")  # t-comma
     text = text.replace("\u0102", "A").replace("\u0103", "a")
     text = text.replace("\u00C2", "A").replace("\u00E2", "a")
     text = text.replace("\u00CE", "I").replace("\u00EE", "i")
@@ -531,33 +546,58 @@ class XLMTokenizer(PreTrainedTokenizer):
 
         - `do_lowercase_and_remove_accent` controle lower casing and accent (automatically set for pretrained vocabularies)
     """
+
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
-                 sep_token="</s>", pad_token="<pad>", cls_token="</s>",
-                 mask_token="<special1>", additional_special_tokens=["<special0>",
-                 "<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
-                 "<special6>", "<special7>", "<special8>", "<special9>"],
-                 lang2id=None, id2lang=None, do_lowercase_and_remove_accent=True,
-                 **kwargs):
-        super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
-                                           sep_token=sep_token, pad_token=pad_token,
-                                           cls_token=cls_token, mask_token=mask_token,
-                                           additional_special_tokens=additional_special_tokens,
-                                           **kwargs)
-
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        sep_token="</s>",
+        pad_token="<pad>",
+        cls_token="</s>",
+        mask_token="<special1>",
+        additional_special_tokens=[
+            "<special0>",
+            "<special1>",
+            "<special2>",
+            "<special3>",
+            "<special4>",
+            "<special5>",
+            "<special6>",
+            "<special7>",
+            "<special8>",
+            "<special9>",
+        ],
+        lang2id=None,
+        id2lang=None,
+        do_lowercase_and_remove_accent=True,
+        **kwargs
+    ):
+        super(XLMTokenizer, self).__init__(
+            unk_token=unk_token,
+            bos_token=bos_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs
+        )
 
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
-        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens 
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
 
         # cache of sm.MosesPunctNormalizer instance
         self.cache_moses_punct_normalizer = dict()
         # cache of sm.MosesTokenizer instance
         self.cache_moses_tokenizer = dict()
-        self.lang_with_custom_tokenizer = set(['zh', 'th', 'ja'])
+        self.lang_with_custom_tokenizer = set(["zh", "th", "ja"])
         # True for current supported model (v1.2.0), False for XLM-17 & 100
         self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent
         self.lang2id = lang2id
@@ -570,9 +610,9 @@ class XLMTokenizer(PreTrainedTokenizer):
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
             self.encoder = json.load(vocab_handle)
-        self.decoder = {v:k for k,v in self.encoder.items()}
-        with open(merges_file, encoding='utf-8') as merges_handle:
-            merges = merges_handle.read().split('\n')[:-1]
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            merges = merges_handle.read().split("\n")[:-1]
         merges = [tuple(merge.split()[:2]) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
@@ -603,9 +643,14 @@ class XLMTokenizer(PreTrainedTokenizer):
         if self.ja_word_tokenizer is None:
             try:
                 import Mykytea
-                self.ja_word_tokenizer = Mykytea.Mykytea('-model %s/local/share/kytea/model.bin' % os.path.expanduser('~'))
+
+                self.ja_word_tokenizer = Mykytea.Mykytea(
+                    "-model %s/local/share/kytea/model.bin" % os.path.expanduser("~")
+                )
             except (AttributeError, ImportError) as e:
-                logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps")
+                logger.error(
+                    "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps"
+                )
                 logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
                 logger.error("2. autoreconf -i")
                 logger.error("3. ./configure --prefix=$HOME/local")
@@ -619,16 +664,16 @@ class XLMTokenizer(PreTrainedTokenizer):
         return len(self.encoder)
 
     def bpe(self, token):
-        word = tuple(token[:-1]) + (token[-1] + '</w>',)
+        word = tuple(token[:-1]) + (token[-1] + "</w>",)
         if token in self.cache:
             return self.cache[token]
         pairs = get_pairs(word)
 
         if not pairs:
-            return token+'</w>'
+            return token + "</w>"
 
         while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
             if bigram not in self.bpe_ranks:
                 break
             first, second = bigram
@@ -643,8 +688,8 @@ class XLMTokenizer(PreTrainedTokenizer):
                     new_word.extend(word[i:])
                     break
 
-                if word[i] == first and i < len(word)-1 and word[i+1] == second:
-                    new_word.append(first+second)
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
                     i += 2
                 else:
                     new_word.append(word[i])
@@ -655,13 +700,13 @@ class XLMTokenizer(PreTrainedTokenizer):
                 break
             else:
                 pairs = get_pairs(word)
-        word = ' '.join(word)
-        if word == '\n  </w>':
-            word = '\n</w>'
+        word = " ".join(word)
+        if word == "\n  </w>":
+            word = "\n</w>"
         self.cache[token] = word
         return word
 
-    def _tokenize(self, text, lang='en', bypass_tokenizer=False):
+    def _tokenize(self, text, lang="en", bypass_tokenizer=False):
         """
         Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific tokenizerself. Otherwise, we use Moses.
 
@@ -697,45 +742,49 @@ class XLMTokenizer(PreTrainedTokenizer):
             List of tokens.
         """
         if lang and self.lang2id and lang not in self.lang2id:
-            logger.error("Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model.")
+            logger.error(
+                "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model."
+            )
         if bypass_tokenizer:
             text = text.split()
         elif lang not in self.lang_with_custom_tokenizer:
             text = self.moses_pipeline(text, lang=lang)
             # TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step
-            if lang == 'ro':
+            if lang == "ro":
                 text = romanian_preprocessing(text)
             text = self.moses_tokenize(text, lang=lang)
-        elif lang == 'th':
+        elif lang == "th":
             text = self.moses_pipeline(text, lang=lang)
             try:
-                if 'pythainlp' not in sys.modules:
+                if "pythainlp" not in sys.modules:
                     from pythainlp.tokenize import word_tokenize as th_word_tokenize
                 else:
-                    th_word_tokenize = sys.modules['pythainlp'].word_tokenize
+                    th_word_tokenize = sys.modules["pythainlp"].word_tokenize
             except (AttributeError, ImportError) as e:
-                logger.error("Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps")
+                logger.error(
+                    "Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps"
+                )
                 logger.error("1. pip install pythainlp")
                 raise e
             text = th_word_tokenize(text)
-        elif lang == 'zh':
+        elif lang == "zh":
             try:
-                if 'jieba' not in sys.modules:
+                if "jieba" not in sys.modules:
                     import jieba
                 else:
-                    jieba = sys.modules['jieba']
+                    jieba = sys.modules["jieba"]
             except (AttributeError, ImportError) as e:
                 logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps")
                 logger.error("1. pip install jieba")
                 raise e
-            text = ' '.join(jieba.cut(text))
+            text = " ".join(jieba.cut(text))
             text = self.moses_pipeline(text, lang=lang)
             text = text.split()
-        elif lang == 'ja':
+        elif lang == "ja":
             text = self.moses_pipeline(text, lang=lang)
             text = self.ja_tokenize(text)
         else:
-            raise ValueError('It should not reach here')
+            raise ValueError("It should not reach here")
 
         if self.do_lowercase_and_remove_accent and not bypass_tokenizer:
             text = lowercase_and_remove_accent(text)
@@ -743,7 +792,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         split_tokens = []
         for token in text:
             if token:
-                split_tokens.extend([t for t in self.bpe(token).split(' ')])
+                split_tokens.extend([t for t in self.bpe(token).split(" ")])
 
         return split_tokens
 
@@ -757,7 +806,7 @@ class XLMTokenizer(PreTrainedTokenizer):
 
     def convert_tokens_to_string(self, tokens):
         """ Converts a sequence of tokens (string) in a single string. """
-        out_string = ''.join(tokens).replace('</w>', ' ').strip()
+        out_string = "".join(tokens).replace("</w>", " ").strip()
         return out_string
 
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
@@ -792,8 +841,10 @@ class XLMTokenizer(PreTrainedTokenizer):
 
         if already_has_special_tokens:
             if token_ids_1 is not None:
-                raise ValueError("You should not supply a second sequence if the provided sequence of "
-                                 "ids is already formated with special tokens for the model.")
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formated with special tokens for the model."
+                )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
         if token_ids_1 is not None:
@@ -820,20 +871,22 @@ class XLMTokenizer(PreTrainedTokenizer):
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
-        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
 
-        with open(vocab_file, 'w', encoding='utf-8') as f:
+        with open(vocab_file, "w", encoding="utf-8") as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
 
         index = 0
         with open(merge_file, "w", encoding="utf-8") as writer:
             for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                 if index != token_index:
-                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
+                    logger.warning(
+                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!".format(merge_file)
+                    )
                     index = token_index
-                writer.write(' '.join(bpe_tokens) + u'\n')
+                writer.write(" ".join(bpe_tokens) + "\n")
                 index += 1
 
         return vocab_file, merge_file
diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py
index adbc8cd6c7..30814c3a1d 100644
--- a/transformers/tokenization_xlm_roberta.py
+++ b/transformers/tokenization_xlm_roberta.py
@@ -13,8 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 """ Tokenization classes for XLM-RoBERTa model."""
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 import os
@@ -26,29 +25,29 @@ from .tokenization_xlnet import SPIECE_UNDERLINE
 
 logger = logging.getLogger(__name__)
 
-VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
-    {
-    'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model",
-    'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model",
-    'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-sentencepiece.bpe.model",
-    'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-sentencepiece.bpe.model",
-    'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-sentencepiece.bpe.model",
-    'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-sentencepiece.bpe.model",
+    "vocab_file": {
+        "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model",
+        "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model",
+        "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-sentencepiece.bpe.model",
+        "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-sentencepiece.bpe.model",
+        "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-sentencepiece.bpe.model",
+        "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-sentencepiece.bpe.model",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'xlm-roberta-base': 512,
-    'xlm-roberta-large': 512,
-    'xlm-roberta-large-finetuned-conll02-dutch': 512,
-    'xlm-roberta-large-finetuned-conll02-spanish': 512,
-    'xlm-roberta-large-finetuned-conll03-english': 512,
-    'xlm-roberta-large-finetuned-conll03-german': 512,
+    "xlm-roberta-base": 512,
+    "xlm-roberta-large": 512,
+    "xlm-roberta-large-finetuned-conll02-dutch": 512,
+    "xlm-roberta-large-finetuned-conll02-spanish": 512,
+    "xlm-roberta-large-finetuned-conll03-english": 512,
+    "xlm-roberta-large-finetuned-conll03-german": 512,
 }
 
+
 class XLMRobertaTokenizer(PreTrainedTokenizer):
     """
         Adapted from RobertaTokenizer and XLNetTokenizer
@@ -56,17 +55,33 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
 
             - requires `SentencePiece <https://github.com/google/sentencepiece>`_
     """
+
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
-                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
-                 **kwargs):
-        super(XLMRobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
-                                                  sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
-                                                  mask_token=mask_token,
-                                                  **kwargs)
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        **kwargs
+    ):
+        super(XLMRobertaTokenizer, self).__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            **kwargs
+        )
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
         self.sp_model = spm.SentencePieceProcessor()
@@ -85,7 +100,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
         self.fairseq_offset = 1
 
-        self.fairseq_tokens_to_ids['<mask>'] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
+        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
         self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
 
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
@@ -119,8 +134,10 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
             if token_ids_1 is not None:
-                raise ValueError("You should not supply a second sequence if the provided sequence of "
-                                 "ids is already formated with special tokens for the model.")
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formated with special tokens for the model."
+                )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
         if token_ids_1 is None:
@@ -164,7 +181,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
         return out_string
 
     def save_vocabulary(self, save_directory):
@@ -174,7 +191,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
 
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py
index a8369df67b..8ea0ccb77b 100644
--- a/transformers/tokenization_xlnet.py
+++ b/transformers/tokenization_xlnet.py
@@ -13,8 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Tokenization classes for XLNet model."""
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
 import os
@@ -27,51 +26,69 @@ from .tokenization_utils import PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)
 
-VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
-    {
-    'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model",
-    'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model",
+    "vocab_file": {
+        "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model",
+        "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'xlnet-base-cased': None,
-    'xlnet-large-cased': None,
+    "xlnet-base-cased": None,
+    "xlnet-large-cased": None,
 }
 
-SPIECE_UNDERLINE = u'▁'
+SPIECE_UNDERLINE = "▁"
 
 # Segments (not really needed)
-SEG_ID_A   = 0
-SEG_ID_B   = 1
+SEG_ID_A = 0
+SEG_ID_B = 1
 SEG_ID_CLS = 2
 SEG_ID_SEP = 3
 SEG_ID_PAD = 4
 
+
 class XLNetTokenizer(PreTrainedTokenizer):
     """
         SentencePiece based tokenizer. Peculiarities:
 
             - requires `SentencePiece <https://github.com/google/sentencepiece>`_
     """
+
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     padding_side = "left"
 
-    def __init__(self, vocab_file,
-                 do_lower_case=False, remove_space=True, keep_accents=False,
-                 bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
-                 pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
-                 additional_special_tokens=["<eop>", "<eod>"], **kwargs):
-        super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
-                                             unk_token=unk_token, sep_token=sep_token,
-                                             pad_token=pad_token, cls_token=cls_token,
-                                             mask_token=mask_token, additional_special_tokens=
-                                             additional_special_tokens, **kwargs)
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=False,
+        remove_space=True,
+        keep_accents=False,
+        bos_token="<s>",
+        eos_token="</s>",
+        unk_token="<unk>",
+        sep_token="<sep>",
+        pad_token="<pad>",
+        cls_token="<cls>",
+        mask_token="<mask>",
+        additional_special_tokens=["<eop>", "<eod>"],
+        **kwargs
+    ):
+        super(XLNetTokenizer, self).__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs
+        )
 
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
@@ -80,8 +97,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
         try:
             import sentencepiece as spm
         except ImportError:
-            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
-                           "pip install sentencepiece")
+            logger.warning(
+                "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
+                "pip install sentencepiece"
+            )
 
         self.do_lower_case = do_lower_case
         self.remove_space = remove_space
@@ -105,24 +124,26 @@ class XLNetTokenizer(PreTrainedTokenizer):
         try:
             import sentencepiece as spm
         except ImportError:
-            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
-                           "pip install sentencepiece")
+            logger.warning(
+                "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
+                "pip install sentencepiece"
+            )
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(self.vocab_file)
 
     def preprocess_text(self, inputs):
         if self.remove_space:
-            outputs = ' '.join(inputs.strip().split())
+            outputs = " ".join(inputs.strip().split())
         else:
             outputs = inputs
         outputs = outputs.replace("``", '"').replace("''", '"')
 
         if six.PY2 and isinstance(outputs, str):
-            outputs = outputs.decode('utf-8')
+            outputs = outputs.decode("utf-8")
 
         if not self.keep_accents:
-            outputs = unicodedata.normalize('NFKD', outputs)
-            outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
+            outputs = unicodedata.normalize("NFKD", outputs)
+            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
         if self.do_lower_case:
             outputs = outputs.lower()
 
@@ -135,7 +156,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         text = self.preprocess_text(text)
         # note(zhiliny): in some systems, sentencepiece only accepts str for py2
         if six.PY2 and isinstance(text, unicode):
-            text = text.encode('utf-8')
+            text = text.encode("utf-8")
 
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
@@ -143,9 +164,8 @@ class XLNetTokenizer(PreTrainedTokenizer):
             pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
         new_pieces = []
         for piece in pieces:
-            if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit():
-                cur_pieces = self.sp_model.EncodeAsPieces(
-                    piece[:-1].replace(SPIECE_UNDERLINE, ''))
+            if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
+                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                 if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                     if len(cur_pieces[0]) == 1:
                         cur_pieces = cur_pieces[1:]
@@ -161,7 +181,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
             ret_pieces = []
             for piece in new_pieces:
                 if isinstance(piece, str):
-                    piece = piece.decode('utf-8')
+                    piece = piece.decode("utf-8")
                 ret_pieces.append(piece)
             new_pieces = ret_pieces
 
@@ -175,12 +195,12 @@ class XLNetTokenizer(PreTrainedTokenizer):
         """Converts an index (integer) in a token (string/unicode) using the vocab."""
         token = self.sp_model.IdToPiece(index)
         if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode('utf-8')
+            token = token.decode("utf-8")
         return token
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
         return out_string
 
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
@@ -215,8 +235,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
 
         if already_has_special_tokens:
             if token_ids_1 is not None:
-                raise ValueError("You should not supply a second sequence if the provided sequence of "
-                                 "ids is already formated with special tokens for the model.")
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formated with special tokens for the model."
+                )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
         if token_ids_1 is not None:
@@ -247,7 +269,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
 
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
diff --git a/utils/download_glue_data.py b/utils/download_glue_data.py
index de8cfa9e73..7262dd7201 100644
--- a/utils/download_glue_data.py
+++ b/utils/download_glue_data.py
@@ -1,4 +1,4 @@
-''' Script for downloading all GLUE data.
+""" Script for downloading all GLUE data.
 Original source: https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e
 
 Note: for legal reasons, we are unable to host MRPC.
@@ -16,7 +16,7 @@ rm MSRParaphraseCorpus.msi
 
 1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
 2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
-'''
+"""
 
 import os
 import sys
@@ -27,20 +27,23 @@ import urllib.request
 import zipfile
 
 TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
-TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
-             "SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
-             "MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
-             "QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
-             "STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
-             "MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
-             "SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
-             "QNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601',
-             "RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
-             "WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
-             "diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}
+TASK2PATH = {
+    "CoLA": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4",
+    "SST": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8",
+    "MRPC": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc",
+    "QQP": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5",
+    "STS": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5",
+    "MNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce",
+    "SNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df",
+    "QNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601",
+    "RTE": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb",
+    "WNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf",
+    "diagnostic": "https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D",
+}
+
+MRPC_TRAIN = "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt"
+MRPC_TEST = "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt"
 
-MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
-MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
 
 def download_and_extract(task, data_dir):
     print("Downloading and extracting %s..." % task)
@@ -51,6 +54,7 @@ def download_and_extract(task, data_dir):
     os.remove(data_file)
     print("\tCompleted!")
 
+
 def format_mrpc(data_dir, path_to_data):
     print("Processing MRPC...")
     mrpc_dir = os.path.join(data_dir, "MRPC")
@@ -72,30 +76,32 @@ def format_mrpc(data_dir, path_to_data):
     dev_ids = []
     with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh:
         for row in ids_fh:
-            dev_ids.append(row.strip().split('\t'))
+            dev_ids.append(row.strip().split("\t"))
 
-    with open(mrpc_train_file, encoding="utf8") as data_fh, \
-         open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding="utf8") as train_fh, \
-         open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding="utf8") as dev_fh:
+    with open(mrpc_train_file, encoding="utf8") as data_fh, open(
+        os.path.join(mrpc_dir, "train.tsv"), "w", encoding="utf8"
+    ) as train_fh, open(os.path.join(mrpc_dir, "dev.tsv"), "w", encoding="utf8") as dev_fh:
         header = data_fh.readline()
         train_fh.write(header)
         dev_fh.write(header)
         for row in data_fh:
-            label, id1, id2, s1, s2 = row.strip().split('\t')
+            label, id1, id2, s1, s2 = row.strip().split("\t")
             if [id1, id2] in dev_ids:
                 dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
             else:
                 train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
 
-    with open(mrpc_test_file, encoding="utf8") as data_fh, \
-            open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding="utf8") as test_fh:
+    with open(mrpc_test_file, encoding="utf8") as data_fh, open(
+        os.path.join(mrpc_dir, "test.tsv"), "w", encoding="utf8"
+    ) as test_fh:
         header = data_fh.readline()
         test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
         for idx, row in enumerate(data_fh):
-            label, id1, id2, s1, s2 = row.strip().split('\t')
+            label, id1, id2, s1, s2 = row.strip().split("\t")
             test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
     print("\tCompleted!")
 
+
 def download_diagnostic(data_dir):
     print("Downloading and extracting diagnostic...")
     if not os.path.isdir(os.path.join(data_dir, "diagnostic")):
@@ -105,8 +111,9 @@ def download_diagnostic(data_dir):
     print("\tCompleted!")
     return
 
+
 def get_tasks(task_names):
-    task_names = task_names.split(',')
+    task_names = task_names.split(",")
     if "all" in task_names:
         tasks = TASKS
     else:
@@ -116,13 +123,19 @@ def get_tasks(task_names):
             tasks.append(task_name)
     return tasks
 
+
 def main(arguments):
     parser = argparse.ArgumentParser()
-    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
-    parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
-                        type=str, default='all')
-    parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_text.txt',
-                        type=str, default='')
+    parser.add_argument("--data_dir", help="directory to save data to", type=str, default="glue_data")
+    parser.add_argument(
+        "--tasks", help="tasks to download data for as a comma separated string", type=str, default="all"
+    )
+    parser.add_argument(
+        "--path_to_mrpc",
+        help="path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_text.txt",
+        type=str,
+        default="",
+    )
     args = parser.parse_args(arguments)
 
     if not os.path.isdir(args.data_dir):
@@ -130,13 +143,13 @@ def main(arguments):
     tasks = get_tasks(args.tasks)
 
     for task in tasks:
-        if task == 'MRPC':
+        if task == "MRPC":
             format_mrpc(args.data_dir, args.path_to_mrpc)
-        elif task == 'diagnostic':
+        elif task == "diagnostic":
             download_diagnostic(args.data_dir)
         else:
             download_and_extract(task, args.data_dir)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(main(sys.argv[1:]))
diff --git a/utils/link_tester.py b/utils/link_tester.py
index fe3990d28c..0ef165c401 100644
--- a/utils/link_tester.py
+++ b/utils/link_tester.py
@@ -43,7 +43,7 @@ def scan_code_for_links(source):
     """ Scans the file to find links using a regular expression.
     Returns a list of links.
     """
-    with open(source, 'r') as content:
+    with open(source, "r") as content:
         content = content.read()
         raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
         links = [prefix + suffix for _, prefix, suffix in raw_links]