Reformat source code with black.

This is the result of: $ black --line-length 119 examples templates transformers utils hubconf.py setup.py There's a lot of fairly long lines in the project. As a consequence, I'm picking the longest widely accepted line length, 119 characters. This is also Thomas' preference, because it allows for explicit variable names, to make the code easier to understand.
2019-12-21 15:46:46 +01:00
parent 63e3827c6b
commit fa84ae26d6
200 changed files with 17452 additions and 12594 deletions
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -247,16 +247,18 @@ the wall, slowly on into the Social Predestination Room.
 as they entered."""


-def create_setup_and_compute(model_names: List[str],
-                             gpu: bool = True,
-                             tensorflow: bool = False,
-                             average_over: int = 3,
-                             torchscript: bool = False,
-                             xla: bool = False,
-                             amp: bool = False,
-                             fp16: bool = False,
-                             save_to_csv: bool = False,
-                             csv_filename: str = f"results_{round(time())}.csv"):
+def create_setup_and_compute(
+    model_names: List[str],
+    gpu: bool = True,
+    tensorflow: bool = False,
+    average_over: int = 3,
+    torchscript: bool = False,
+    xla: bool = False,
+    amp: bool = False,
+    fp16: bool = False,
+    save_to_csv: bool = False,
+    csv_filename: str = f"results_{round(time())}.csv",
+):
    if xla:
        tf.config.optimizer.set_jit(True)
    if amp:
@@ -266,7 +268,7 @@ def create_setup_and_compute(model_names: List[str],
        dictionary = {model_name: {} for model_name in model_names}
        results = _compute_tensorflow(model_names, dictionary, average_over, amp)
    else:
-        device = 'cuda' if (gpu and torch.cuda.is_available()) else 'cpu'
+        device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu"
        dictionary = {model_name: {} for model_name in model_names}
        results = _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16)

@@ -276,34 +278,52 @@ def create_setup_and_compute(model_names: List[str],
        for batch_size in results[model_name]["bs"]:
            print("\t\t" + f"===== BATCH SIZE: {batch_size} =====")
            for slice_size in results[model_name]["ss"]:
-                result = results[model_name]['results'][batch_size][slice_size]
+                result = results[model_name]["results"][batch_size][slice_size]
                if isinstance(result, str):
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: "
-                          f"{result}")
+                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result}")
                else:
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: "
-                          f"{(round(1000 * result) / 1000)}"
-                          f"s")
+                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{(round(1000 * result) / 1000)}" f"s")

    if save_to_csv:
-        with open(csv_filename, mode='w') as csv_file:
-            fieldnames = ['model',
-                          '1x8', '1x64', '1x128', '1x256', '1x512', '1x1024',
-                          '2x8', '2x64', '2x128', '2x256', '2x512', '2x1024',
-                          '4x8', '4x64', '4x128', '4x256', '4x512', '4x1024',
-                          '8x8', '8x64', '8x128', '8x256', '8x512', '8x1024',
-                          ]
+        with open(csv_filename, mode="w") as csv_file:
+            fieldnames = [
+                "model",
+                "1x8",
+                "1x64",
+                "1x128",
+                "1x256",
+                "1x512",
+                "1x1024",
+                "2x8",
+                "2x64",
+                "2x128",
+                "2x256",
+                "2x512",
+                "2x1024",
+                "4x8",
+                "4x64",
+                "4x128",
+                "4x256",
+                "4x512",
+                "4x1024",
+                "8x8",
+                "8x64",
+                "8x128",
+                "8x256",
+                "8x512",
+                "8x1024",
+            ]

            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()

            for model_name in model_names:
                model_results = {
-                    f'{bs}x{ss}': results[model_name]['results'][bs][ss]
+                    f"{bs}x{ss}": results[model_name]["results"][bs][ss]
                    for bs in results[model_name]["results"]
-                    for ss in results[model_name]['results'][bs]
+                    for ss in results[model_name]["results"][bs]
                }
-                writer.writerow({'model': model_name, **model_results})
+                writer.writerow({"model": model_name, **model_results})


 def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
@@ -343,7 +363,7 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript,

                        print("Going through model with sequence of shape", sequence.shape)
                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                        average_time = sum(runtimes)/float(len(runtimes)) / 3.0
+                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
                    except RuntimeError as e:
                        print("Doesn't fit on GPU.", e)
@@ -379,7 +399,9 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp):
                if max_input_size is not None and slice_size > max_input_size:
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                else:
-                    sequence = tf.stack([tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size)
+                    sequence = tf.stack(
+                        [tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size
+                    )

                    try:
                        print("Going through model with sequence of shape", sequence.shape)
@@ -387,7 +409,7 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp):
                        inference(sequence)

                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                        average_time = sum(runtimes)/float(len(runtimes)) / 3.0
+                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
                    except tf.errors.ResourceExhaustedError as e:
                        print("Doesn't fit on GPU.", e)
@@ -399,33 +421,64 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp):
 def main():
    parser = argparse.ArgumentParser()

-    parser.add_argument("--models", required=False, type=str, default='all', help="Model checkpoints to be provided "
-                                                                                  "to the AutoModel classes. Leave "
-                                                                                  "blank to benchmark the base version "
-                                                                                  "of all available model "
-                                                                                  "architectures.")
-    parser.add_argument("--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the "
-                                                                             "models")
-    parser.add_argument("--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available "
-                                                                                  "cuda devices")
-    parser.add_argument("--torchscript", required=False, action="store_true", help="Pytorch only: trace the models "
-                                                                                   "using torchscript")
-    parser.add_argument("--tensorflow", required=False, action="store_true", help="Benchmark the TensorFlow version "
-                                                                                  "of the models. Will run on GPU if "
-                                                                                  "the correct dependencies are "
-                                                                                  "installed")
+    parser.add_argument(
+        "--models",
+        required=False,
+        type=str,
+        default="all",
+        help="Model checkpoints to be provided "
+        "to the AutoModel classes. Leave "
+        "blank to benchmark the base version "
+        "of all available model "
+        "architectures.",
+    )
+    parser.add_argument(
+        "--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " "models"
+    )
+    parser.add_argument(
+        "--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available " "cuda devices"
+    )
+    parser.add_argument(
+        "--torchscript",
+        required=False,
+        action="store_true",
+        help="Pytorch only: trace the models " "using torchscript",
+    )
+    parser.add_argument(
+        "--tensorflow",
+        required=False,
+        action="store_true",
+        help="Benchmark the TensorFlow version "
+        "of the models. Will run on GPU if "
+        "the correct dependencies are "
+        "installed",
+    )
    parser.add_argument("--xla", required=False, action="store_true", help="TensorFlow only: use XLA acceleration.")
-    parser.add_argument("--amp", required=False, action="store_true", help="TensorFlow only: use automatic mixed precision acceleration.")
-    parser.add_argument("--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference.")
-    parser.add_argument("--keras_predict", required=False, action="store_true", help="Whether to use model.predict "
-                                                                                     "instead of model() to do a "
-                                                                                     "forward pass.")
+    parser.add_argument(
+        "--amp",
+        required=False,
+        action="store_true",
+        help="TensorFlow only: use automatic mixed precision acceleration.",
+    )
+    parser.add_argument(
+        "--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference."
+    )
+    parser.add_argument(
+        "--keras_predict",
+        required=False,
+        action="store_true",
+        help="Whether to use model.predict " "instead of model() to do a " "forward pass.",
+    )
    parser.add_argument("--save_to_csv", required=False, action="store_true", help="Save to a CSV file.")
-    parser.add_argument("--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv.")
-    parser.add_argument("--average_over", required=False, default=30, type=int, help="Times an experiment will be run.")
+    parser.add_argument(
+        "--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv."
+    )
+    parser.add_argument(
+        "--average_over", required=False, default=30, type=int, help="Times an experiment will be run."
+    )

    args = parser.parse_args()
-    if args.models == 'all':
+    if args.models == "all":
        args.models = [
            "gpt2",
            "bert-base-cased",
@@ -436,7 +489,7 @@ def main():
            "distilbert-base-uncased",
            "distilgpt2",
            "roberta-base",
-            "ctrl"
+            "ctrl",
        ]
    else:
        args.models = args.models.split()
@@ -453,7 +506,7 @@ def main():
                fp16=args.fp16,
                save_to_csv=args.save_to_csv,
                csv_filename=args.csv_filename,
-                average_over=args.average_over
+                average_over=args.average_over,
            )
        else:
            raise ImportError("Trying to run a PyTorch benchmark but PyTorch was not found in the environment.")
@@ -467,11 +520,11 @@ def main():
                amp=args.amp,
                save_to_csv=args.save_to_csv,
                csv_filename=args.csv_filename,
-                average_over=args.average_over
+                average_over=args.average_over,
            )
        else:
            raise ImportError("Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.")

-if __name__ == '__main__':
-    main()

+if __name__ == "__main__":
+    main()
--- a/examples/contrib/run_camembert.py
+++ b/examples/contrib/run_camembert.py
@@ -10,38 +10,37 @@ from transformers.modeling_camembert import CamembertForMaskedLM

 def fill_mask(masked_input, model, tokenizer, topk=5):
    # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
-    assert masked_input.count('<mask>') == 1
+    assert masked_input.count("<mask>") == 1
    input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    logits = model(input_ids)[0]  # The last hidden-state is the first element of the output tuple
    masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
    logits = logits[0, masked_index, :]
    prob = logits.softmax(dim=0)
    values, indices = prob.topk(k=topk, dim=0)
-    topk_predicted_token_bpe = ' '.join([tokenizer.convert_ids_to_tokens(indices[i].item())
-                                         for i in range(len(indices))])
+    topk_predicted_token_bpe = " ".join(
+        [tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))]
+    )
    masked_token = tokenizer.mask_token
    topk_filled_outputs = []
-    for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(' ')):
-        predicted_token = predicted_token_bpe.replace('\u2581', ' ')
+    for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")):
+        predicted_token = predicted_token_bpe.replace("\u2581", " ")
        if " {0}".format(masked_token) in masked_input:
-            topk_filled_outputs.append((
-                masked_input.replace(
-                    ' {0}'.format(masked_token), predicted_token
-                ),
-                values[index].item(),
-                predicted_token,
-            ))
+            topk_filled_outputs.append(
+                (
+                    masked_input.replace(" {0}".format(masked_token), predicted_token),
+                    values[index].item(),
+                    predicted_token,
+                )
+            )
        else:
-            topk_filled_outputs.append((
-                masked_input.replace(masked_token, predicted_token),
-                values[index].item(),
-                predicted_token,
-            ))
+            topk_filled_outputs.append(
+                (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
+            )
    return topk_filled_outputs


-tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
-model = CamembertForMaskedLM.from_pretrained('camembert-base')
+tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
+model = CamembertForMaskedLM.from_pretrained("camembert-base")
 model.eval()

 masked_input = "Le camembert est <mask> :)"
--- a/examples/contrib/run_openai_gpt.py
+++ b/examples/contrib/run_openai_gpt.py
@@ -36,34 +36,42 @@ from tqdm import tqdm, trange

 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

-from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
-                                     AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
-                                     get_linear_schedule_with_warmup)
+from transformers import (
+    OpenAIGPTDoubleHeadsModel,
+    OpenAIGPTTokenizer,
+    AdamW,
+    cached_path,
+    WEIGHTS_NAME,
+    CONFIG_NAME,
+    get_linear_schedule_with_warmup,
+)

 ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"

-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
 logger = logging.getLogger(__name__)

+
 def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

+
 def load_rocstories_dataset(dataset_path):
    """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
-    with open(dataset_path, encoding='utf_8') as f:
+    with open(dataset_path, encoding="utf_8") as f:
        f = csv.reader(f)
        output = []
-        next(f) # skip the first line
+        next(f)  # skip the first line
        for line in tqdm(f):
-            output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))
+            output.append((" ".join(line[1:5]), line[5], line[6], int(line[-1]) - 1))
    return output

+
 def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
    """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)

@@ -80,56 +88,68 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
        for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
            with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
            with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
-            input_ids[i, 0, :len(with_cont1)] = with_cont1
-            input_ids[i, 1, :len(with_cont2)] = with_cont2
+            input_ids[i, 0, : len(with_cont1)] = with_cont1
+            input_ids[i, 1, : len(with_cont2)] = with_cont2
            mc_token_ids[i, 0] = len(with_cont1) - 1
            mc_token_ids[i, 1] = len(with_cont2) - 1
-            lm_labels[i, 0, :len(with_cont1)] = with_cont1
-            lm_labels[i, 1, :len(with_cont2)] = with_cont2
+            lm_labels[i, 0, : len(with_cont1)] = with_cont1
+            lm_labels[i, 1, : len(with_cont2)] = with_cont2
            mc_labels[i] = mc_label
        all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
        tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
    return tensor_datasets

+
 def main():
    parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name', type=str, default='openai-gpt',
-                        help='pretrained model name')
-    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
-    parser.add_argument('--train_dataset', type=str, default='')
-    parser.add_argument('--eval_dataset', type=str, default='')
-    parser.add_argument('--seed', type=int, default=42)
-    parser.add_argument('--num_train_epochs', type=int, default=3)
-    parser.add_argument('--train_batch_size', type=int, default=8)
-    parser.add_argument('--eval_batch_size', type=int, default=16)
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument('--max_grad_norm', type=int, default=1)
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training \
-                        steps to perform. Override num_train_epochs.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before\
-                        performing a backward/update pass.")
-    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
-    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
-    parser.add_argument('--weight_decay', type=float, default=0.01)
-    parser.add_argument('--lm_coef', type=float, default=0.9)
-    parser.add_argument('--n_valid', type=int, default=374)
+    parser.add_argument("--model_name", type=str, default="openai-gpt", help="pretrained model name")
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--train_dataset", type=str, default="")
+    parser.add_argument("--eval_dataset", type=str, default="")
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--num_train_epochs", type=int, default=3)
+    parser.add_argument("--train_batch_size", type=int, default=8)
+    parser.add_argument("--eval_batch_size", type=int, default=16)
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", type=int, default=1)
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training \
+                        steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before\
+                        performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", type=float, default=6.25e-5)
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+    parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
+    parser.add_argument("--weight_decay", type=float, default=0.01)
+    parser.add_argument("--lm_coef", type=float, default=0.9)
+    parser.add_argument("--n_valid", type=int, default=374)

-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -152,7 +172,7 @@ def main():
    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
-    special_tokens = ['_start_', '_delimiter_', '_classify_']
+    special_tokens = ["_start_", "_delimiter_", "_classify_"]
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_tokens(special_tokens)
    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
@@ -163,6 +183,7 @@ def main():
    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)
+
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
@@ -170,6 +191,7 @@ def main():
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)
+
    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
@@ -178,8 +200,11 @@ def main():

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
-    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
-                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
+    input_length = max(
+        len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
+        for dataset in encoded_datasets
+        for story, cont1, cont2, _ in dataset
+    )
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
@@ -198,20 +223,23 @@ def main():
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
-            args.num_train_epochs = args.max_steps //\
-                (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
-            t_total = len(train_dataloader)\
-                // args.gradient_accumulation_steps * args.num_train_epochs
+            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
+            {
+                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
+                "weight_decay": args.weight_decay,
+            },
+            {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+        scheduler = get_linear_schedule_with_warmup(
+            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+        )

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
@@ -230,14 +258,16 @@ def main():
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
-                exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
+                exp_average_loss = (
+                    loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
+                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
+        model_to_save = model.module if hasattr(model, "module") else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
@@ -260,10 +290,12 @@ def main():
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
-               _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
+                _, mc_loss, _, mc_logits = model(
+                    input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels
+                )

            mc_logits = mc_logits.detach().cpu().numpy()
-            mc_labels = mc_labels.to('cpu').numpy()
+            mc_labels = mc_labels.to("cpu").numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
@@ -274,10 +306,8 @@ def main():

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
-        train_loss = tr_loss/nb_tr_steps if args.do_train else None
-        result = {'eval_loss': eval_loss,
-                  'eval_accuracy': eval_accuracy,
-                  'train_loss': train_loss}
+        train_loss = tr_loss / nb_tr_steps if args.do_train else None
+        result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "train_loss": train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
@@ -286,5 +316,6 @@ def main():
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

-if __name__ == '__main__':
+
+if __name__ == "__main__":
    main()
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -28,8 +28,7 @@ import glob

 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler

 try:
@@ -39,31 +38,23 @@ except:

 from tqdm import tqdm, trange

-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForMultipleChoice, BertTokenizer)
+from transformers import WEIGHTS_NAME, BertConfig, BertForMultipleChoice, BertTokenizer

 from transformers import AdamW, get_linear_schedule_with_warmup

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
-                  for conf in [BertConfig]), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in [BertConfig]), ())

 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForMultipleChoice, BertTokenizer),
+    "bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
 }

+
 class SwagExample(object):
    """A single training/test example for the SWAG dataset."""
-    def __init__(self,
-                 swag_id,
-                 context_sentence,
-                 start_ending,
-                 ending_0,
-                 ending_1,
-                 ending_2,
-                 ending_3,
-                 label = None):
+
+    def __init__(self, swag_id, context_sentence, start_ending, ending_0, ending_1, ending_2, ending_3, label=None):
        self.swag_id = swag_id
        self.context_sentence = context_sentence
        self.start_ending = start_ending
@@ -94,57 +85,49 @@ class SwagExample(object):

        return ", ".join(l)

-class InputFeatures(object):
-    def __init__(self,
-                 example_id,
-                 choices_features,
-                 label

-    ):
+class InputFeatures(object):
+    def __init__(self, example_id, choices_features, label):
        self.example_id = example_id
        self.choices_features = [
-            {
-                'input_ids': input_ids,
-                'input_mask': input_mask,
-                'segment_ids': segment_ids
-            }
+            {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
            for _, input_ids, input_mask, segment_ids in choices_features
        ]
        self.label = label

+
 def read_swag_examples(input_file, is_training=True):
-    with open(input_file, 'r', encoding='utf-8') as f:
+    with open(input_file, "r", encoding="utf-8") as f:
        reader = csv.reader(f)
        lines = []
        for line in reader:
            if sys.version_info[0] == 2:
-                line = list(unicode(cell, 'utf-8') for cell in line)
+                line = list(unicode(cell, "utf-8") for cell in line)
            lines.append(line)

-    if is_training and lines[0][-1] != 'label':
-        raise ValueError(
-            "For training, the input file must contain a label column."
-        )
+    if is_training and lines[0][-1] != "label":
+        raise ValueError("For training, the input file must contain a label column.")

    examples = [
        SwagExample(
-            swag_id = line[2],
-            context_sentence = line[4],
-            start_ending = line[5], # in the swag dataset, the
-                                         # common beginning of each
-                                         # choice is stored in "sent2".
-            ending_0 = line[7],
-            ending_1 = line[8],
-            ending_2 = line[9],
-            ending_3 = line[10],
-            label = int(line[11]) if is_training else None
-        ) for line in lines[1:] # we skip the line with the column names
+            swag_id=line[2],
+            context_sentence=line[4],
+            start_ending=line[5],  # in the swag dataset, the
+            # common beginning of each
+            # choice is stored in "sent2".
+            ending_0=line[7],
+            ending_1=line[8],
+            ending_2=line[9],
+            ending_3=line[10],
+            label=int(line[11]) if is_training else None,
+        )
+        for line in lines[1:]  # we skip the line with the column names
    ]

    return examples

-def convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                 is_training):
+
+def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training):
    """Loads a data file into a list of `InputBatch`s."""

    # Swag is a multiple choice task. To perform this task using Bert,
@@ -204,23 +187,18 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
            logger.info("swag_id: {}".format(example.swag_id))
            for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
                logger.info("choice: {}".format(choice_idx))
-                logger.info("tokens: {}".format(' '.join(tokens)))
-                logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
-                logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
-                logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
+                logger.info("tokens: {}".format(" ".join(tokens)))
+                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
+                logger.info("input_mask: {}".format(" ".join(map(str, input_mask))))
+                logger.info("segment_ids: {}".format(" ".join(map(str, segment_ids))))
            if is_training:
                logger.info("label: {}".format(label))

-        features.append(
-            InputFeatures(
-                example_id = example.swag_id,
-                choices_features = choices_features,
-                label = label
-            )
-        )
+        features.append(InputFeatures(example_id=example.swag_id, choices_features=choices_features, label=label))

    return features

+
 def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

@@ -237,18 +215,14 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length):
        else:
            tokens_b.pop()

+
 def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

+
 def select_field(features, field):
-    return [
-        [
-            choice[field]
-            for choice in feature.choices_features
-        ]
-        for feature in features
-    ]
+    return [[choice[field] for choice in feature.choices_features] for feature in features]


 def set_seed(args):
@@ -258,24 +232,28 @@ def set_seed(args):
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

+
 def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
-    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
-        'dev' if evaluate else 'train',
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length)))
+    cached_features_file = os.path.join(
+        os.path.dirname(input_file),
+        "cached_{}_{}_{}".format(
+            "dev" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+        ),
+    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_swag_examples(input_file)
-        features = convert_examples_to_features(
-            examples, tokenizer, args.max_seq_length, not evaluate)
+        features = convert_examples_to_features(examples, tokenizer, args.max_seq_length, not evaluate)

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -285,21 +263,21 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
-    all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
-    all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
+    all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
+    all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
+    all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
    all_label = torch.tensor([f.label for f in features], dtype=torch.long)

    if evaluate:
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_label)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
    else:
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_label)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

    if output_examples:
        return dataset, examples, features
    return dataset
+
+
 def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
@@ -316,13 +294,18 @@ def train(args, train_dataset, model, tokenizer):
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
    if args.fp16:
        try:
            from apex import amp
@@ -336,17 +319,21 @@ def train(args, train_dataset, model, tokenizer):

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

@@ -360,11 +347,13 @@ def train(args, train_dataset, model, tokenizer):
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':       batch[0],
-                      'attention_mask':  batch[1],
-                      #'token_type_ids':  None if args.model_type == 'xlm' else batch[2],
-                      'token_type_ids': batch[2],
-                      'labels':         batch[3]}
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                #'token_type_ids':  None if args.model_type == 'xlm' else batch[2],
+                "token_type_ids": batch[2],
+                "labels": batch[3],
+            }
            # if args.model_type in ['xlnet', 'xlm']:
            #     inputs.update({'cls_index': batch[5],
            #                    'p_mask':       batch[6]})
@@ -372,7 +361,7 @@ def train(args, train_dataset, model, tokenizer):
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

@@ -393,23 +382,27 @@ def train(args, train_dataset, model, tokenizer):

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_vocabulary(output_dir)
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
@@ -424,6 +417,7 @@ def train(args, train_dataset, model, tokenizer):

    return global_step, tr_loss / global_step

+
 def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

@@ -440,7 +434,6 @@ def evaluate(args, model, tokenizer, prefix=""):
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

-
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

@@ -448,11 +441,13 @@ def evaluate(args, model, tokenizer, prefix=""):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      # 'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
-                      'token_type_ids': batch[2],
-                      'labels':         batch[3]}
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                # 'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+                "token_type_ids": batch[2],
+                "labels": batch[3],
+            }

            # if args.model_type in ['xlnet', 'xlm']:
            #     inputs.update({'cls_index': batch[4],
@@ -462,17 +457,16 @@ def evaluate(args, model, tokenizer, prefix=""):
            eval_loss += tmp_eval_loss.mean().item()

        logits = logits.detach().cpu().numpy()
-        label_ids = inputs['labels'].to('cpu').numpy()
+        label_ids = inputs["labels"].to("cpu").numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy

        nb_eval_steps += 1
-        nb_eval_examples += inputs['input_ids'].size(0)
+        nb_eval_examples += inputs["input_ids"].size(0)

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples
-    result = {'eval_loss': eval_loss,
-              'eval_accuracy': eval_accuracy}
+    result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy}

    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
@@ -483,92 +477,144 @@ def evaluate(args, model, tokenizer, prefix=""):

    return result

+
 def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
-    parser.add_argument("--train_file", default=None, type=str, required=True,
-                        help="SWAG csv for training. E.g., train.csv")
-    parser.add_argument("--predict_file", default=None, type=str, required=True,
-                        help="SWAG csv for predictions. E.g., val.csv or test.csv")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model checkpoints and predictions will be written.")
+    parser.add_argument(
+        "--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv"
+    )
+    parser.add_argument(
+        "--predict_file",
+        default=None,
+        type=str,
+        required=True,
+        help="SWAG csv for predictions. E.g., val.csv or test.csv",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model checkpoints and predictions will be written.",
+    )

    ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--max_seq_length", default=384, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences "
-                             "longer than this will be truncated, and sequences shorter than this will be padded.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=384,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences "
+        "longer than this will be truncated, and sequences shorter than this will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )

-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()

-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -580,16 +626,24 @@ def main():
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )

    # Set seed
    set_seed(args)
@@ -601,8 +655,12 @@ def main():
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
+    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -617,7 +675,6 @@ def main():
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

-
    # Save the trained model and the tokenizer
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        # Create output directory if needed
@@ -627,19 +684,20 @@ def main():
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

-
    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
@@ -650,14 +708,16 @@ def main():
            checkpoints = [args.model_name_or_path]

        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            tokenizer = tokenizer_class.from_pretrained(checkpoint)
            model.to(args.device)
@@ -665,7 +725,7 @@ def main():
            # Evaluate
            result = evaluate(args, model, tokenizer, prefix=global_step)

-            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)

    logger.info("Results: {}".format(results))
--- a/examples/contrib/run_transfo_xl.py
+++ b/examples/contrib/run_transfo_xl.py
@@ -30,44 +30,36 @@ import torch

 from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer

-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
 logger = logging.getLogger(__name__)

+
 def main():
-    parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
-    parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
-                        help='pretrained model name')
-    parser.add_argument('--split', type=str, default='test',
-                        choices=['all', 'valid', 'test'],
-                        help='which split to evaluate')
-    parser.add_argument('--batch_size', type=int, default=10,
-                        help='batch size')
-    parser.add_argument('--tgt_len', type=int, default=128,
-                        help='number of tokens to predict')
-    parser.add_argument('--ext_len', type=int, default=0,
-                        help='length of the extended context')
-    parser.add_argument('--mem_len', type=int, default=1600,
-                        help='length of the retained previous heads')
-    parser.add_argument('--clamp_len', type=int, default=1000,
-                        help='max positional embedding index')
-    parser.add_argument('--no_cuda', action='store_true',
-                        help='Do not use CUDA even though CUA is available')
-    parser.add_argument('--work_dir', type=str, required=True,
-                        help='path to the work_dir')
-    parser.add_argument('--no_log', action='store_true',
-                        help='do not log the eval result')
-    parser.add_argument('--same_length', action='store_true',
-                        help='set same length attention with masking')
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser = argparse.ArgumentParser(description="PyTorch Transformer Language Model")
+    parser.add_argument("--model_name", type=str, default="transfo-xl-wt103", help="pretrained model name")
+    parser.add_argument(
+        "--split", type=str, default="test", choices=["all", "valid", "test"], help="which split to evaluate"
+    )
+    parser.add_argument("--batch_size", type=int, default=10, help="batch size")
+    parser.add_argument("--tgt_len", type=int, default=128, help="number of tokens to predict")
+    parser.add_argument("--ext_len", type=int, default=0, help="length of the extended context")
+    parser.add_argument("--mem_len", type=int, default=1600, help="length of the retained previous heads")
+    parser.add_argument("--clamp_len", type=int, default=1000, help="max positional embedding index")
+    parser.add_argument("--no_cuda", action="store_true", help="Do not use CUDA even though CUA is available")
+    parser.add_argument("--work_dir", type=str, required=True, help="path to the work_dir")
+    parser.add_argument("--no_log", action="store_true", help="do not log the eval result")
+    parser.add_argument("--same_length", action="store_true", help="set same length attention with masking")
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()
-    assert args.ext_len >= 0, 'extended context length must be non-negative'
+    assert args.ext_len >= 0, "extended context length must be non-negative"

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -84,17 +76,18 @@ def main():
    corpus = TransfoXLCorpus.from_pretrained(args.model_name)
    ntokens = len(corpus.vocab)

-    va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
-        device=device, ext_len=args.ext_len)
-    te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
-        device=device, ext_len=args.ext_len)
+    va_iter = corpus.get_iterator("valid", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
+    te_iter = corpus.get_iterator("test", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)

    # Load a pre-trained model
    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
    model = model.to(device)

-    logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
-        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
+    logger.info(
+        "Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".format(
+            args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len
+        )
+    )

    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
    if args.clamp_len > 0:
@@ -108,7 +101,7 @@ def main():
    def evaluate(eval_iter):
        # Turn on evaluation mode which disables dropout.
        model.eval()
-        total_len, total_loss = 0, 0.
+        total_len, total_loss = 0, 0.0
        start_time = time.time()
        with torch.no_grad():
            mems = None
@@ -119,35 +112,34 @@ def main():
                total_loss += seq_len * loss.item()
                total_len += seq_len
            total_time = time.time() - start_time
-        logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
-                total_time, 1000 * total_time / (idx+1)))
+        logger.info("Time : {:.2f}s, {:.2f}ms/segment".format(total_time, 1000 * total_time / (idx + 1)))
        return total_loss / total_len

    # Run on test data.
-    if args.split == 'all':
+    if args.split == "all":
        test_loss = evaluate(te_iter)
        valid_loss = evaluate(va_iter)
-    elif args.split == 'valid':
+    elif args.split == "valid":
        valid_loss = evaluate(va_iter)
        test_loss = None
-    elif args.split == 'test':
+    elif args.split == "test":
        test_loss = evaluate(te_iter)
        valid_loss = None

    def format_log(loss, split):
-        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
-            split, loss, math.exp(loss))
+        log_str = "| {0} loss {1:5.2f} | {0} ppl {2:9.3f} ".format(split, loss, math.exp(loss))
        return log_str

-    log_str = ''
+    log_str = ""
    if valid_loss is not None:
-        log_str += format_log(valid_loss, 'valid')
+        log_str += format_log(valid_loss, "valid")
    if test_loss is not None:
-        log_str += format_log(test_loss, 'test')
+        log_str += format_log(test_loss, "test")

-    logger.info('=' * 100)
+    logger.info("=" * 100)
    logger.info(log_str)
-    logger.info('=' * 100)
+    logger.info("=" * 100)

-if __name__ == '__main__':
+
+if __name__ == "__main__":
    main()
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -40,14 +40,12 @@ from utils import logger
 from lm_seqs_dataset import LmSeqsDataset
 from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups

+
 class Distiller:
-    def __init__(self,
-                 params: dict,
-                 dataset: LmSeqsDataset,
-                 token_probs: torch.tensor,
-                 student: nn.Module,
-                 teacher: nn.Module):
-        logger.info('Initializing Distiller')
+    def __init__(
+        self, params: dict, dataset: LmSeqsDataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module
+    ):
+        logger.info("Initializing Distiller")
        self.params = params
        self.dump_path = params.dump_path
        self.multi_gpu = params.multi_gpu
@@ -70,12 +68,10 @@ class Distiller:
        else:
            sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)

-        self.dataloader = DataLoader(dataset=dataset,
-                                     batch_sampler=sampler,
-                                     collate_fn=dataset.batch_sequences)
+        self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences)

        self.temperature = params.temperature
-        assert self.temperature > 0.
+        assert self.temperature > 0.0

        self.alpha_ce = params.alpha_ce
        self.alpha_mlm = params.alpha_mlm
@@ -85,18 +81,18 @@ class Distiller:

        self.mlm = params.mlm
        if self.mlm:
-            logger.info(f'Using MLM loss for LM step.')
+            logger.info(f"Using MLM loss for LM step.")
            self.mlm_mask_prop = params.mlm_mask_prop
            assert 0.0 <= self.mlm_mask_prop <= 1.0
            assert params.word_mask + params.word_keep + params.word_rand == 1.0
            self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
-            self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs
-            self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs
+            self.pred_probs = self.pred_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else self.pred_probs
+            self.token_probs = token_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else token_probs
            if self.fp16:
                self.pred_probs = self.pred_probs.half()
                self.token_probs = self.token_probs.half()
        else:
-            logger.info(f'Using CLM loss for LM step.')
+            logger.info(f"Using CLM loss for LM step.")

        self.epoch = 0
        self.n_iter = 0
@@ -107,38 +103,54 @@ class Distiller:
        self.last_loss_ce = 0
        self.last_loss_mlm = 0
        self.last_loss_clm = 0
-        if self.alpha_mse > 0.: self.last_loss_mse = 0
-        if self.alpha_cos > 0.: self.last_loss_cos = 0
+        if self.alpha_mse > 0.0:
+            self.last_loss_mse = 0
+        if self.alpha_cos > 0.0:
+            self.last_loss_cos = 0
        self.last_log = 0

-        self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
+        self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
-        if self.alpha_mse > 0.:
-            self.mse_loss_fct = nn.MSELoss(reduction='sum')
-        if self.alpha_cos > 0.:
-            self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction='mean')
+        if self.alpha_mse > 0.0:
+            self.mse_loss_fct = nn.MSELoss(reduction="sum")
+        if self.alpha_cos > 0.0:
+            self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean")

-        logger.info('--- Initializing model optimizer')
+        logger.info("--- Initializing model optimizer")
        assert params.gradient_accumulation_steps >= 1
        self.num_steps_epoch = len(self.dataloader)
-        num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
+        num_train_optimization_steps = (
+            int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
+        )

-        no_decay = ['bias', 'LayerNorm.weight']
+        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
-            {'params': [p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': params.weight_decay},
-            {'params': [p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': 0.0}
+            {
+                "params": [
+                    p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad
+                ],
+                "weight_decay": params.weight_decay,
+            },
+            {
+                "params": [
+                    p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad
+                ],
+                "weight_decay": 0.0,
+            },
        ]
-        logger.info("------ Number of trainable parameters (student): %i" % sum([p.numel() for p in self.student.parameters() if p.requires_grad]))
+        logger.info(
+            "------ Number of trainable parameters (student): %i"
+            % sum([p.numel() for p in self.student.parameters() if p.requires_grad])
+        )
        logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()]))
-        self.optimizer = AdamW(optimizer_grouped_parameters,
-                               lr=params.learning_rate,
-                               eps=params.adam_epsilon,
-                               betas=(0.9, 0.98))
+        self.optimizer = AdamW(
+            optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98)
+        )

        warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
-        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
-                                                num_warmup_steps=warmup_steps,
-                                                num_training_steps=num_train_optimization_steps)
+        self.scheduler = get_linear_schedule_with_warmup(
+            self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps
+        )

        if self.fp16:
            try:
@@ -146,33 +158,36 @@ class Distiller:
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level")
-            self.student, self.optimizer = amp.initialize(self.student,
-                                                          self.optimizer,
-                                                          opt_level=self.params.fp16_opt_level)
+            self.student, self.optimizer = amp.initialize(
+                self.student, self.optimizer, opt_level=self.params.fp16_opt_level
+            )
            self.teacher = self.teacher.half()

        if self.multi_gpu:
            if self.fp16:
                from apex.parallel import DistributedDataParallel
+
                logger.info("Using apex.parallel.DistributedDataParallel for distributed training.")
                self.student = DistributedDataParallel(self.student)
            else:
                from torch.nn.parallel import DistributedDataParallel
+
                logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
-                self.student = DistributedDataParallel(self.student,
-                                                       device_ids=[params.local_rank],
-                                                       output_device=params.local_rank,
-                                                       find_unused_parameters=True)
+                self.student = DistributedDataParallel(
+                    self.student,
+                    device_ids=[params.local_rank],
+                    output_device=params.local_rank,
+                    find_unused_parameters=True,
+                )

        self.is_master = params.is_master
        if self.is_master:
-            logger.info('--- Initializing Tensorboard')
-            self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, 'log', 'train'))
-            self.tensorboard.add_text(tag='config/training', text_string=str(self.params), global_step=0)
-            self.tensorboard.add_text(tag='config/student', text_string=str(self.student_config), global_step=0)
+            logger.info("--- Initializing Tensorboard")
+            self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, "log", "train"))
+            self.tensorboard.add_text(tag="config/training", text_string=str(self.params), global_step=0)
+            self.tensorboard.add_text(tag="config/student", text_string=str(self.student_config), global_step=0)

-    def prepare_batch_mlm(self,
-                          batch):
+    def prepare_batch_mlm(self, batch):
        """
        Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM.

@@ -192,7 +207,7 @@ class Distiller:
        token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
        assert token_ids.size(0) == lengths.size(0)

-        attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
+        attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]

        bs, max_seq_len = token_ids.size()
        mlm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
@@ -200,11 +215,13 @@ class Distiller:
        x_prob = self.token_probs[token_ids.flatten()]
        n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item())
        tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False)
-        pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.bool, device=token_ids.device) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
+        pred_mask = torch.zeros(
+            bs * max_seq_len, dtype=torch.bool, device=token_ids.device
+        )  # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
        pred_mask[tgt_ids] = 1
        pred_mask = pred_mask.view(bs, max_seq_len)

-        pred_mask[token_ids == self.params.special_tok_ids['pad_token']] = 0
+        pred_mask[token_ids == self.params.special_tok_ids["pad_token"]] = 0

        # mask a number of words == 0 [8] (faster with fp16)
        if self.fp16:
@@ -213,26 +230,29 @@ class Distiller:
                pred_mask = pred_mask.view(-1)
                n2 = max(n1 % 8, 8 * (n1 // 8))
                if n2 != n1:
-                    pred_mask[torch.nonzero(pred_mask).view(-1)[:n1-n2]] = 0
+                    pred_mask[torch.nonzero(pred_mask).view(-1)[: n1 - n2]] = 0
                pred_mask = pred_mask.view(bs, max_seq_len)
                assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item()

        _token_ids_real = token_ids[pred_mask]
        _token_ids_rand = _token_ids_real.clone().random_(self.vocab_size)
-        _token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids['mask_token'])
+        _token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids["mask_token"])
        probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True)
-        _token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
+        _token_ids = (
+            _token_ids_mask * (probs == 0).long()
+            + _token_ids_real * (probs == 1).long()
+            + _token_ids_rand * (probs == 2).long()
+        )
        token_ids = token_ids.masked_scatter(pred_mask, _token_ids)

-        mlm_labels[~pred_mask] = -100 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
+        mlm_labels[~pred_mask] = -100  # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility

        # sanity checks
        assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size

        return token_ids, attn_mask, mlm_labels

-    def prepare_batch_clm(self,
-                          batch):
+    def prepare_batch_clm(self, batch):
        """
        Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the labels for CLM.

@@ -252,18 +272,16 @@ class Distiller:
        token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
        assert token_ids.size(0) == lengths.size(0)

-        attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
+        attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
        clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
-        clm_labels[~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
+        clm_labels[~attn_mask] = -100  # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility

        # sanity checks
        assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size

        return token_ids, attn_mask, clm_labels

-    def round_batch(self,
-                    x: torch.tensor,
-                    lengths: torch.tensor):
+    def round_batch(self, x: torch.tensor, lengths: torch.tensor):
        """
        For float16 only.
        Sub-sample sentences in a batch, and add padding, so that each dimension is a multiple of 8.
@@ -299,9 +317,9 @@ class Distiller:
            pad = 8 - (ml1 % 8)
            ml2 = ml1 + pad
            if self.mlm:
-                pad_id = self.params.special_tok_ids['pad_token']
+                pad_id = self.params.special_tok_ids["pad_token"]
            else:
-                pad_id = self.params.special_tok_ids['unk_token']
+                pad_id = self.params.special_tok_ids["unk_token"]
            padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id)
            x = torch.cat([x, padding_tensor], 1)
            assert x.size() == (bs2, ml2)
@@ -314,20 +332,22 @@ class Distiller:
        """
        The real training loop.
        """
-        if self.is_master: logger.info('Starting training')
+        if self.is_master:
+            logger.info("Starting training")
        self.last_log = time.time()
        self.student.train()
        self.teacher.eval()

        for _ in range(self.params.n_epoch):
-            if self.is_master: logger.info(f'--- Starting epoch {self.epoch}/{self.params.n_epoch-1}')
+            if self.is_master:
+                logger.info(f"--- Starting epoch {self.epoch}/{self.params.n_epoch-1}")
            if self.multi_gpu:
                torch.distributed.barrier()

            iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
            for batch in iter_bar:
                if self.params.n_gpu > 0:
-                    batch = tuple(t.to(f'cuda:{self.params.local_rank}') for t in batch)
+                    batch = tuple(t.to(f"cuda:{self.params.local_rank}") for t in batch)

                if self.mlm:
                    token_ids, attn_mask, lm_labels = self.prepare_batch_mlm(batch=batch)
@@ -336,22 +356,21 @@ class Distiller:
                self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels)

                iter_bar.update()
-                iter_bar.set_postfix({'Last_loss': f'{self.last_loss:.2f}',
-                                      'Avg_cum_loss': f'{self.total_loss_epoch/self.n_iter:.2f}'})
+                iter_bar.set_postfix(
+                    {"Last_loss": f"{self.last_loss:.2f}", "Avg_cum_loss": f"{self.total_loss_epoch/self.n_iter:.2f}"}
+                )
            iter_bar.close()

-            if self.is_master: logger.info(f'--- Ending epoch {self.epoch}/{self.params.n_epoch-1}')
+            if self.is_master:
+                logger.info(f"--- Ending epoch {self.epoch}/{self.params.n_epoch-1}")
            self.end_epoch()

        if self.is_master:
-            logger.info(f'Save very last checkpoint as `pytorch_model.bin`.')
-            self.save_checkpoint(checkpoint_name=f'pytorch_model.bin')
-            logger.info('Training is finished')
+            logger.info(f"Save very last checkpoint as `pytorch_model.bin`.")
+            self.save_checkpoint(checkpoint_name=f"pytorch_model.bin")
+            logger.info("Training is finished")

-    def step(self,
-             input_ids: torch.tensor,
-             attention_mask: torch.tensor,
-             lm_labels: torch.tensor):
+    def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor):
        """
        One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
        and possibly a parameter update (depending on the gradient accumulation).
@@ -363,78 +382,91 @@ class Distiller:
        lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM).
        """
        if self.mlm:
-            s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask)     # (bs, seq_length, voc_size)
+            s_logits, s_hidden_states = self.student(
+                input_ids=input_ids, attention_mask=attention_mask
+            )  # (bs, seq_length, voc_size)
            with torch.no_grad():
-                t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
+                t_logits, t_hidden_states = self.teacher(
+                    input_ids=input_ids, attention_mask=attention_mask
+                )  # (bs, seq_length, voc_size)
        else:
-            s_logits, _, s_hidden_states = self.student(input_ids=input_ids, attention_mask=None)            # (bs, seq_length, voc_size)
+            s_logits, _, s_hidden_states = self.student(
+                input_ids=input_ids, attention_mask=None
+            )  # (bs, seq_length, voc_size)
            with torch.no_grad():
-                t_logits, _, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=None)           # (bs, seq_length, voc_size)
+                t_logits, _, t_hidden_states = self.teacher(
+                    input_ids=input_ids, attention_mask=None
+                )  # (bs, seq_length, voc_size)
        assert s_logits.size() == t_logits.size()

-        #https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
-        #https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
+        # https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
+        # https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
        if self.params.restrict_ce_to_mask:
-            mask = (lm_labels>-1).unsqueeze(-1).expand_as(s_logits)    # (bs, seq_lenth, voc_size)
+            mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits)  # (bs, seq_lenth, voc_size)
        else:
-            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)    # (bs, seq_lenth, voc_size)
-        s_logits_slct = torch.masked_select(s_logits, mask)            # (bs * seq_length * voc_size) modulo the 1s in mask
-        s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1))      # (bs * seq_length, voc_size) modulo the 1s in mask
-        t_logits_slct = torch.masked_select(t_logits, mask)            # (bs * seq_length * voc_size) modulo the 1s in mask
-        t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1))      # (bs * seq_length, voc_size) modulo the 1s in mask
+            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)  # (bs, seq_lenth, voc_size)
+        s_logits_slct = torch.masked_select(s_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
+        s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
+        t_logits_slct = torch.masked_select(t_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
+        t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
        assert t_logits_slct.size() == s_logits_slct.size()

-        loss_ce = self.ce_loss_fct(F.log_softmax(s_logits_slct/self.temperature, dim=-1),
-                                   F.softmax(t_logits_slct/self.temperature, dim=-1)) * (self.temperature)**2
-        loss = self.alpha_ce*loss_ce
+        loss_ce = (
+            self.ce_loss_fct(
+                F.log_softmax(s_logits_slct / self.temperature, dim=-1),
+                F.softmax(t_logits_slct / self.temperature, dim=-1),
+            )
+            * (self.temperature) ** 2
+        )
+        loss = self.alpha_ce * loss_ce

-        if self.alpha_mlm > 0.:
+        if self.alpha_mlm > 0.0:
            loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1))
            loss += self.alpha_mlm * loss_mlm
-        if self.alpha_clm > 0.:
+        if self.alpha_clm > 0.0:
            shift_logits = s_logits[..., :-1, :].contiguous()
            shift_labels = lm_labels[..., 1:].contiguous()
-            loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-                                        shift_labels.view(-1))
+            loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            loss += self.alpha_clm * loss_clm

-        if self.alpha_mse > 0.:
-            loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
+        if self.alpha_mse > 0.0:
+            loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct) / s_logits_slct.size(
+                0
+            )  # Reproducing batchmean reduction
            loss += self.alpha_mse * loss_mse
-        if self.alpha_cos > 0.:
-            s_hidden_states = s_hidden_states[-1]                              # (bs, seq_length, dim)
-            t_hidden_states = t_hidden_states[-1]                              # (bs, seq_length, dim)
-            mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states)     # (bs, seq_length, dim)
+        if self.alpha_cos > 0.0:
+            s_hidden_states = s_hidden_states[-1]  # (bs, seq_length, dim)
+            t_hidden_states = t_hidden_states[-1]  # (bs, seq_length, dim)
+            mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states)  # (bs, seq_length, dim)
            assert s_hidden_states.size() == t_hidden_states.size()
            dim = s_hidden_states.size(-1)
-            
-            s_hidden_states_slct = torch.masked_select(s_hidden_states, mask)        # (bs * seq_length * dim)
-            s_hidden_states_slct = s_hidden_states_slct.view(-1, dim)                # (bs * seq_length, dim)
-            t_hidden_states_slct = torch.masked_select(t_hidden_states, mask)        # (bs * seq_length * dim)
-            t_hidden_states_slct = t_hidden_states_slct.view(-1, dim)                # (bs * seq_length, dim)
-        
-            target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,)
+
+            s_hidden_states_slct = torch.masked_select(s_hidden_states, mask)  # (bs * seq_length * dim)
+            s_hidden_states_slct = s_hidden_states_slct.view(-1, dim)  # (bs * seq_length, dim)
+            t_hidden_states_slct = torch.masked_select(t_hidden_states, mask)  # (bs * seq_length * dim)
+            t_hidden_states_slct = t_hidden_states_slct.view(-1, dim)  # (bs * seq_length, dim)
+
+            target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1)  # (bs * seq_length,)
            loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target)
            loss += self.alpha_cos * loss_cos

        self.total_loss_epoch += loss.item()
        self.last_loss = loss.item()
        self.last_loss_ce = loss_ce.item()
-        if self.alpha_mlm > 0.:
+        if self.alpha_mlm > 0.0:
            self.last_loss_mlm = loss_mlm.item()
-        if self.alpha_clm > 0.:
+        if self.alpha_clm > 0.0:
            self.last_loss_clm = loss_clm.item()
-        if self.alpha_mse > 0.:
+        if self.alpha_mse > 0.0:
            self.last_loss_mse = loss_mse.item()
-        if self.alpha_cos > 0.:
+        if self.alpha_cos > 0.0:
            self.last_loss_cos = loss_cos.item()

        self.optimize(loss)

        self.n_sequences_epoch += input_ids.size(0)

-    def optimize(self,
-                 loss):
+    def optimize(self, loss):
        """
        Normalization on the loss (gradient accumulation or distributed training), followed by
        backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
@@ -442,7 +474,7 @@ class Distiller:
        """
        # Check for NaN
        if (loss != loss).data.any():
-            logger.error('NaN detected')
+            logger.error("NaN detected")
            exit()

        if self.multi_gpu:
@@ -452,6 +484,7 @@ class Distiller:

        if self.fp16:
            from apex import amp
+
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
@@ -488,53 +521,84 @@ class Distiller:
            return

        for param_name, param in self.student.named_parameters():
-            self.tensorboard.add_scalar(tag='parameter_mean/' + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter)
-            self.tensorboard.add_scalar(tag='parameter_std/' + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter)
+            self.tensorboard.add_scalar(
+                tag="parameter_mean/" + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter
+            )
+            self.tensorboard.add_scalar(
+                tag="parameter_std/" + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter
+            )
            if param.grad is None:
                continue
-            self.tensorboard.add_scalar(tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(),global_step=self.n_total_iter)
-            self.tensorboard.add_scalar(tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter)
+            self.tensorboard.add_scalar(
+                tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(), global_step=self.n_total_iter
+            )
+            self.tensorboard.add_scalar(
+                tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter
+            )

-        self.tensorboard.add_scalar(tag="losses/cum_avg_loss_epoch", scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.n_total_iter)
+        self.tensorboard.add_scalar(
+            tag="losses/cum_avg_loss_epoch",
+            scalar_value=self.total_loss_epoch / self.n_iter,
+            global_step=self.n_total_iter,
+        )
        self.tensorboard.add_scalar(tag="losses/loss", scalar_value=self.last_loss, global_step=self.n_total_iter)
-        self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter)
-        if self.alpha_mlm > 0.:
-            self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
-        if self.alpha_clm > 0.:
-            self.tensorboard.add_scalar(tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter)
-        if self.alpha_mse > 0.:
-            self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
-        if self.alpha_cos > 0.:
-            self.tensorboard.add_scalar(tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter)
-        self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter)
-        
-        self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter)
-        self.tensorboard.add_scalar(tag="global/speed", scalar_value=time.time()-self.last_log, global_step=self.n_total_iter)
+        self.tensorboard.add_scalar(
+            tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter
+        )
+        if self.alpha_mlm > 0.0:
+            self.tensorboard.add_scalar(
+                tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter
+            )
+        if self.alpha_clm > 0.0:
+            self.tensorboard.add_scalar(
+                tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter
+            )
+        if self.alpha_mse > 0.0:
+            self.tensorboard.add_scalar(
+                tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter
+            )
+        if self.alpha_cos > 0.0:
+            self.tensorboard.add_scalar(
+                tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter
+            )
+        self.tensorboard.add_scalar(
+            tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter
+        )
+
+        self.tensorboard.add_scalar(
+            tag="global/memory_usage",
+            scalar_value=psutil.virtual_memory()._asdict()["used"] / 1_000_000,
+            global_step=self.n_total_iter,
+        )
+        self.tensorboard.add_scalar(
+            tag="global/speed", scalar_value=time.time() - self.last_log, global_step=self.n_total_iter
+        )

    def end_epoch(self):
        """
        Finally arrived at the end of epoch (full pass on dataset).
        Do some tensorboard logging and checkpoint saving.
        """
-        logger.info(f'{self.n_sequences_epoch} sequences have been trained during this epoch.')
+        logger.info(f"{self.n_sequences_epoch} sequences have been trained during this epoch.")

        if self.is_master:
-            self.save_checkpoint(checkpoint_name=f'model_epoch_{self.epoch}.pth')
-            self.tensorboard.add_scalar(tag='epoch/loss', scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.epoch)
+            self.save_checkpoint(checkpoint_name=f"model_epoch_{self.epoch}.pth")
+            self.tensorboard.add_scalar(
+                tag="epoch/loss", scalar_value=self.total_loss_epoch / self.n_iter, global_step=self.epoch
+            )

        self.epoch += 1
        self.n_sequences_epoch = 0
        self.n_iter = 0
        self.total_loss_epoch = 0

-    def save_checkpoint(self,
-                        checkpoint_name: str = 'checkpoint.pth'):
+    def save_checkpoint(self, checkpoint_name: str = "checkpoint.pth"):
        """
        Save the current state. Only by the master process.
        """
        if not self.is_master:
            return
-        mdl_to_save = self.student.module if hasattr(self.student, 'module') else self.student
+        mdl_to_save = self.student.module if hasattr(self.student, "module") else self.student
        mdl_to_save.config.save_pretrained(self.dump_path)
        state_dict = mdl_to_save.state_dict()
        torch.save(state_dict, os.path.join(self.dump_path, checkpoint_name))
--- a/examples/distillation/grouped_batch_sampler.py
+++ b/examples/distillation/grouped_batch_sampler.py
@@ -23,12 +23,14 @@ from torch.utils.data.sampler import BatchSampler, Sampler

 from utils import logger

+
 def _quantize(x, bins):
    bins = copy.deepcopy(bins)
    bins = sorted(bins)
    quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
    return quantized

+
 def create_lengths_groups(lengths, k=0):
    bins = np.arange(start=3, stop=k, step=4).tolist() if k > 0 else [10]
    groups = _quantize(lengths, bins)
@@ -39,6 +41,7 @@ def create_lengths_groups(lengths, k=0):
    logger.info("Count of instances per bin: {}".format(counts))
    return groups

+
 class GroupedBatchSampler(BatchSampler):
    """
    Wraps another sampler to yield a mini-batch of indices.
@@ -53,11 +56,11 @@ class GroupedBatchSampler(BatchSampler):
            0, i.e. they must be in the range [0, num_groups).
        batch_size (int): Size of mini-batch.
    """
+
    def __init__(self, sampler, group_ids, batch_size):
        if not isinstance(sampler, Sampler):
            raise ValueError(
-                "sampler should be an instance of "
-                "torch.utils.data.Sampler, but got sampler={}".format(sampler)
+                "sampler should be an instance of " "torch.utils.data.Sampler, but got sampler={}".format(sampler)
            )
        self.sampler = sampler
        self.group_ids = group_ids
@@ -73,7 +76,7 @@ class GroupedBatchSampler(BatchSampler):
            buffer_per_group[group_id].append(idx)
            samples_per_group[group_id].append(idx)
            if len(buffer_per_group[group_id]) == self.batch_size:
-                yield buffer_per_group[group_id] #TODO
+                yield buffer_per_group[group_id]  # TODO
                num_batches += 1
                del buffer_per_group[group_id]
            assert len(buffer_per_group[group_id]) < self.batch_size
@@ -90,8 +93,8 @@ class GroupedBatchSampler(BatchSampler):
            for group_id, idxs in sorted(buffer_per_group.items(), key=lambda x: x[0]):
                batch_idx.extend(idxs)
                if len(batch_idx) >= self.batch_size:
-                    yield batch_idx[:self.batch_size]
-                    batch_idx = batch_idx[self.batch_size:]
+                    yield batch_idx[: self.batch_size]
+                    batch_idx = batch_idx[self.batch_size :]
                    num_remaining -= 1
            if len(batch_idx) > 0:
                yield batch_idx
--- a/examples/distillation/lm_seqs_dataset.py
+++ b/examples/distillation/lm_seqs_dataset.py
@@ -21,6 +21,7 @@ from torch.utils.data import Dataset
 import numpy as np
 from utils import logger

+
 class LmSeqsDataset(Dataset):
    """Custom Dataset wrapping language modeling sequences.

@@ -32,9 +33,7 @@ class LmSeqsDataset(Dataset):
        data: `List[np.array[int]]
    """

-    def __init__(self,
-                 params,
-                 data):
+    def __init__(self, params, data):
        self.params = params

        self.token_ids = np.array(data)
@@ -57,7 +56,7 @@ class LmSeqsDataset(Dataset):
        Some sanity checks
        """
        assert len(self.token_ids) == len(self.lengths)
-        assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths))) 
+        assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths)))

    def remove_long_sequences(self):
        """
@@ -65,17 +64,17 @@ class LmSeqsDataset(Dataset):
        """
        max_len = self.params.max_model_input_size
        indices = self.lengths > max_len
-        logger.info(f'Splitting {sum(indices)} too long sequences.')
+        logger.info(f"Splitting {sum(indices)} too long sequences.")

        def divide_chunks(l, n):
-            return [l[i:i + n] for i in range(0, len(l), n)]
+            return [l[i : i + n] for i in range(0, len(l), n)]

        new_tok_ids = []
        new_lengths = []
        if self.params.mlm:
-            cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']
+            cls_id, sep_id = self.params.special_tok_ids["cls_token"], self.params.special_tok_ids["sep_token"]
        else:
-            cls_id, sep_id = self.params.special_tok_ids['bos_token'], self.params.special_tok_ids['eos_token']
+            cls_id, sep_id = self.params.special_tok_ids["bos_token"], self.params.special_tok_ids["eos_token"]

        for seq_, len_ in zip(self.token_ids, self.lengths):
            assert (seq_[0] == cls_id) and (seq_[-1] == sep_id), seq_
@@ -84,7 +83,7 @@ class LmSeqsDataset(Dataset):
                new_lengths.append(len_)
            else:
                sub_seqs = []
-                for sub_s in divide_chunks(seq_, max_len-2):
+                for sub_s in divide_chunks(seq_, max_len - 2):
                    if sub_s[0] != cls_id:
                        sub_s = np.insert(sub_s, 0, cls_id)
                    if sub_s[-1] != sep_id:
@@ -108,7 +107,7 @@ class LmSeqsDataset(Dataset):
        self.token_ids = self.token_ids[indices]
        self.lengths = self.lengths[indices]
        new_size = len(self)
-        logger.info(f'Remove {init_size - new_size} too short (<=11 tokens) sequences.')
+        logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")

    def print_statistics(self):
        """
@@ -116,7 +115,7 @@ class LmSeqsDataset(Dataset):
        """
        if not self.params.is_master:
            return
-        logger.info(f'{len(self)} sequences')
+        logger.info(f"{len(self)} sequences")
        # data_len = sum(self.lengths)
        # nb_unique_tokens = len(Counter(list(chain(*self.token_ids))))
        # logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)')
@@ -125,8 +124,7 @@ class LmSeqsDataset(Dataset):
        # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
        # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')

-    def batch_sequences(self,
-                        batch):
+    def batch_sequences(self, batch):
        """
        Do the padding and transform into torch.tensor.
        """
@@ -139,13 +137,13 @@ class LmSeqsDataset(Dataset):

        # Pad token ids
        if self.params.mlm:
-            pad_idx = self.params.special_tok_ids['pad_token']
+            pad_idx = self.params.special_tok_ids["pad_token"]
        else:
-            pad_idx = self.params.special_tok_ids['unk_token']
-        tk_ = [list(t.astype(int)) + [pad_idx]*(max_seq_len_-len(t)) for t in token_ids]
+            pad_idx = self.params.special_tok_ids["unk_token"]
+        tk_ = [list(t.astype(int)) + [pad_idx] * (max_seq_len_ - len(t)) for t in token_ids]
        assert len(tk_) == len(token_ids)
        assert all(len(t) == max_seq_len_ for t in tk_)

-        tk_t = torch.tensor(tk_)      # (bs, max_seq_len_)
+        tk_t = torch.tensor(tk_)  # (bs, max_seq_len_)
        lg_t = torch.tensor(lengths)  # (bs)
        return tk_t, lg_t
--- a/examples/distillation/run_squad_w_distillation.py
+++ b/examples/distillation/run_squad_w_distillation.py
@@ -25,8 +25,7 @@ import glob

 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 import torch.nn.functional as F
 import torch.nn as nn
@@ -38,19 +37,32 @@ except:

 from tqdm import tqdm, trange

-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForQuestionAnswering, BertTokenizer,
-                                  XLMConfig, XLMForQuestionAnswering,
-                                  XLMTokenizer, XLNetConfig,
-                                  XLNetForQuestionAnswering,
-                                  XLNetTokenizer,
-                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertForQuestionAnswering,
+    BertTokenizer,
+    XLMConfig,
+    XLMForQuestionAnswering,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetForQuestionAnswering,
+    XLNetTokenizer,
+    DistilBertConfig,
+    DistilBertForQuestionAnswering,
+    DistilBertTokenizer,
+)

 from transformers import AdamW, get_linear_schedule_with_warmup

-from ..utils_squad import (read_squad_examples, convert_examples_to_features,
-                         RawResult, write_predictions,
-                         RawResultExtended, write_predictions_extended)
+from ..utils_squad import (
+    read_squad_examples,
+    convert_examples_to_features,
+    RawResult,
+    write_predictions,
+    RawResultExtended,
+    write_predictions_extended,
+)

 # The follwing import is the official SQuAD evaluation script (2.0).
 # You can remove it from the dependencies if you are using this script outside of the library
@@ -59,16 +71,18 @@ from ..utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
-                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
+ALL_MODELS = sum(
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ()
+)

 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
-    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
-    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+    "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer),
+    "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
+    "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
 }

+
 def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
@@ -76,9 +90,11 @@ def set_seed(args):
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

+
 def to_list(tensor):
    return tensor.detach().cpu().tolist()

+
 def train(args, train_dataset, model, tokenizer, teacher=None):
    """ Train the model """
    if args.local_rank in [-1, 0]:
@@ -95,13 +111,18 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
    if args.fp16:
        try:
            from apex import amp
@@ -115,17 +136,21 @@ def train(args, train_dataset, model, tokenizer, teacher=None):

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

@@ -141,40 +166,47 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
            if teacher is not None:
                teacher.eval()
            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':       batch[0],
-                      'attention_mask':  batch[1], 
-                      'start_positions': batch[3], 
-                      'end_positions':   batch[4]}
-            if args.model_type != 'distilbert':
-                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
-            if args.model_type in ['xlnet', 'xlm']:
-                inputs.update({'cls_index': batch[5],
-                               'p_mask':       batch[6]})
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                "start_positions": batch[3],
+                "end_positions": batch[4],
+            }
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]
+            if args.model_type in ["xlnet", "xlm"]:
+                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
            outputs = model(**inputs)
            loss, start_logits_stu, end_logits_stu = outputs

            # Distillation loss
            if teacher is not None:
-                if 'token_type_ids' not in inputs:
-                    inputs['token_type_ids'] = None if args.teacher_type == 'xlm' else batch[2]
+                if "token_type_ids" not in inputs:
+                    inputs["token_type_ids"] = None if args.teacher_type == "xlm" else batch[2]
                with torch.no_grad():
-                    start_logits_tea, end_logits_tea = teacher(input_ids=inputs['input_ids'],
-                                                               token_type_ids=inputs['token_type_ids'],
-                                                               attention_mask=inputs['attention_mask'])
+                    start_logits_tea, end_logits_tea = teacher(
+                        input_ids=inputs["input_ids"],
+                        token_type_ids=inputs["token_type_ids"],
+                        attention_mask=inputs["attention_mask"],
+                    )
                assert start_logits_tea.size() == start_logits_stu.size()
                assert end_logits_tea.size() == end_logits_stu.size()
-                
-                loss_fct = nn.KLDivLoss(reduction='batchmean')
-                loss_start = loss_fct(F.log_softmax(start_logits_stu/args.temperature, dim=-1),
-                                      F.softmax(start_logits_tea/args.temperature, dim=-1)) * (args.temperature**2)
-                loss_end = loss_fct(F.log_softmax(end_logits_stu/args.temperature, dim=-1),
-                                    F.softmax(end_logits_tea/args.temperature, dim=-1)) * (args.temperature**2)
-                loss_ce = (loss_start + loss_end)/2.

-                loss = args.alpha_ce*loss_ce + args.alpha_squad*loss
+                loss_fct = nn.KLDivLoss(reduction="batchmean")
+                loss_start = loss_fct(
+                    F.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                    F.softmax(start_logits_tea / args.temperature, dim=-1),
+                ) * (args.temperature ** 2)
+                loss_end = loss_fct(
+                    F.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                    F.softmax(end_logits_tea / args.temperature, dim=-1),
+                ) * (args.temperature ** 2)
+                loss_ce = (loss_start + loss_end) / 2.0
+
+                loss = args.alpha_ce * loss_ce + args.alpha_squad * loss

            if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

@@ -195,22 +227,26 @@ def train(args, train_dataset, model, tokenizer, teacher=None):

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
@@ -246,32 +282,31 @@ def evaluate(args, model, tokenizer, prefix=""):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1]
-                      }
-            if args.model_type != 'distilbert':
-                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]  # XLM don't use segment_ids
            example_indices = batch[3]
-            if args.model_type in ['xlnet', 'xlm']:
-                inputs.update({'cls_index': batch[4],
-                               'p_mask':    batch[5]})
+            if args.model_type in ["xlnet", "xlm"]:
+                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
-            if args.model_type in ['xlnet', 'xlm']:
+            if args.model_type in ["xlnet", "xlm"]:
                # XLNet uses a more complex post-processing procedure
-                result = RawResultExtended(unique_id            = unique_id,
-                                           start_top_log_probs  = to_list(outputs[0][i]),
-                                           start_top_index      = to_list(outputs[1][i]),
-                                           end_top_log_probs    = to_list(outputs[2][i]),
-                                           end_top_index        = to_list(outputs[3][i]),
-                                           cls_logits           = to_list(outputs[4][i]))
+                result = RawResultExtended(
+                    unique_id=unique_id,
+                    start_top_log_probs=to_list(outputs[0][i]),
+                    start_top_index=to_list(outputs[1][i]),
+                    end_top_log_probs=to_list(outputs[2][i]),
+                    end_top_index=to_list(outputs[3][i]),
+                    cls_logits=to_list(outputs[4][i]),
+                )
            else:
-                result = RawResult(unique_id    = unique_id,
-                                   start_logits = to_list(outputs[0][i]),
-                                   end_logits   = to_list(outputs[1][i]))
+                result = RawResult(
+                    unique_id=unique_id, start_logits=to_list(outputs[0][i]), end_logits=to_list(outputs[1][i])
+                )
            all_results.append(result)

    # Compute predictions
@@ -282,23 +317,44 @@ def evaluate(args, model, tokenizer, prefix=""):
    else:
        output_null_log_odds_file = None

-    if args.model_type in ['xlnet', 'xlm']:
+    if args.model_type in ["xlnet", "xlm"]:
        # XLNet uses a more complex post-processing procedure
-        write_predictions_extended(examples, features, all_results, args.n_best_size,
-                        args.max_answer_length, output_prediction_file,
-                        output_nbest_file, output_null_log_odds_file, args.predict_file,
-                        model.config.start_n_top, model.config.end_n_top,
-                        args.version_2_with_negative, tokenizer, args.verbose_logging)
+        write_predictions_extended(
+            examples,
+            features,
+            all_results,
+            args.n_best_size,
+            args.max_answer_length,
+            output_prediction_file,
+            output_nbest_file,
+            output_null_log_odds_file,
+            args.predict_file,
+            model.config.start_n_top,
+            model.config.end_n_top,
+            args.version_2_with_negative,
+            tokenizer,
+            args.verbose_logging,
+        )
    else:
-        write_predictions(examples, features, all_results, args.n_best_size,
-                        args.max_answer_length, args.do_lower_case, output_prediction_file,
-                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
-                        args.version_2_with_negative, args.null_score_diff_threshold)
+        write_predictions(
+            examples,
+            features,
+            all_results,
+            args.n_best_size,
+            args.max_answer_length,
+            args.do_lower_case,
+            output_prediction_file,
+            output_nbest_file,
+            output_null_log_odds_file,
+            args.verbose_logging,
+            args.version_2_with_negative,
+            args.null_score_diff_threshold,
+        )

    # Evaluate with the official SQuAD script
-    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
-                                 pred_file=output_prediction_file,
-                                 na_prob_file=output_null_log_odds_file)
+    evaluate_options = EVAL_OPTS(
+        data_file=args.predict_file, pred_file=output_prediction_file, na_prob_file=output_null_log_odds_file
+    )
    results = evaluate_on_squad(evaluate_options)
    return results

@@ -309,24 +365,30 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal

    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
-    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
-        'dev' if evaluate else 'train',
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length)))
+    cached_features_file = os.path.join(
+        os.path.dirname(input_file),
+        "cached_{}_{}_{}".format(
+            "dev" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+        ),
+    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
-        examples = read_squad_examples(input_file=input_file,
-                                                is_training=not evaluate,
-                                                version_2_with_negative=args.version_2_with_negative)
-        features = convert_examples_to_features(examples=examples,
-                                                tokenizer=tokenizer,
-                                                max_seq_length=args.max_seq_length,
-                                                doc_stride=args.doc_stride,
-                                                max_query_length=args.max_query_length,
-                                                is_training=not evaluate)
+        examples = read_squad_examples(
+            input_file=input_file, is_training=not evaluate, version_2_with_negative=args.version_2_with_negative
+        )
+        features = convert_examples_to_features(
+            examples=examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=not evaluate,
+        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
@@ -342,14 +404,21 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_example_index, all_cls_index, all_p_mask)
+        dataset = TensorDataset(
+            all_input_ids, all_input_mask, all_segment_ids, all_example_index, all_cls_index, all_p_mask
+        )
    else:
        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_start_positions, all_end_positions,
-                                all_cls_index, all_p_mask)
+        dataset = TensorDataset(
+            all_input_ids,
+            all_input_mask,
+            all_segment_ids,
+            all_start_positions,
+            all_end_positions,
+            all_cls_index,
+            all_p_mask,
+        )

    if output_examples:
        return dataset, examples, features
@@ -360,121 +429,213 @@ def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
-    parser.add_argument("--train_file", default=None, type=str, required=True,
-                        help="SQuAD json for training. E.g., train-v1.1.json")
-    parser.add_argument("--predict_file", default=None, type=str, required=True,
-                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model checkpoints and predictions will be written.")
+    parser.add_argument(
+        "--train_file", default=None, type=str, required=True, help="SQuAD json for training. E.g., train-v1.1.json"
+    )
+    parser.add_argument(
+        "--predict_file",
+        default=None,
+        type=str,
+        required=True,
+        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model checkpoints and predictions will be written.",
+    )

    # Distillation parameters (optional)
-    parser.add_argument('--teacher_type', default=None, type=str,
-                        help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.")
-    parser.add_argument('--teacher_name_or_path', default=None, type=str,
-                        help="Path to the already SQuAD fine-tuned teacher model. Only for distillation.")
-    parser.add_argument('--alpha_ce', default=0.5, type=float,
-                        help="Distillation loss linear weight. Only for distillation.")
-    parser.add_argument('--alpha_squad', default=0.5, type=float,
-                        help="True SQuAD loss linear weight. Only for distillation.")
-    parser.add_argument('--temperature', default=2.0, type=float,
-                        help="Distillation temperature. Only for distillation.")
+    parser.add_argument(
+        "--teacher_type",
+        default=None,
+        type=str,
+        help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.",
+    )
+    parser.add_argument(
+        "--teacher_name_or_path",
+        default=None,
+        type=str,
+        help="Path to the already SQuAD fine-tuned teacher model. Only for distillation.",
+    )
+    parser.add_argument(
+        "--alpha_ce", default=0.5, type=float, help="Distillation loss linear weight. Only for distillation."
+    )
+    parser.add_argument(
+        "--alpha_squad", default=0.5, type=float, help="True SQuAD loss linear weight. Only for distillation."
+    )
+    parser.add_argument(
+        "--temperature", default=2.0, type=float, help="Distillation temperature. Only for distillation."
+    )

    ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )

-    parser.add_argument('--version_2_with_negative', action='store_true',
-                        help='If true, the SQuAD examples contain some that do not have an answer.')
-    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
-                        help="If null_score - best_non_null is greater than the threshold predict null.")
+    parser.add_argument(
+        "--version_2_with_negative",
+        action="store_true",
+        help="If true, the SQuAD examples contain some that do not have an answer.",
+    )
+    parser.add_argument(
+        "--null_score_diff_threshold",
+        type=float,
+        default=0.0,
+        help="If null_score - best_non_null is greater than the threshold predict null.",
+    )

-    parser.add_argument("--max_seq_length", default=384, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
-                             "longer than this will be truncated, and sequences shorter than this will be padded.")
-    parser.add_argument("--doc_stride", default=128, type=int,
-                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
-    parser.add_argument("--max_query_length", default=64, type=int,
-                        help="The maximum number of tokens for the question. Questions longer than this will "
-                             "be truncated to this length.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--max_seq_length",
+        default=384,
+        type=int,
+        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
+        "longer than this will be truncated, and sequences shorter than this will be padded.",
+    )
+    parser.add_argument(
+        "--doc_stride",
+        default=128,
+        type=int,
+        help="When splitting up a long document into chunks, how much stride to take between chunks.",
+    )
+    parser.add_argument(
+        "--max_query_length",
+        default=64,
+        type=int,
+        help="The maximum number of tokens for the question. Questions longer than this will "
+        "be truncated to this length.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )

-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
-    parser.add_argument("--n_best_size", default=20, type=int,
-                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
-    parser.add_argument("--max_answer_length", default=30, type=int,
-                        help="The maximum length of an answer that can be generated. This is needed because the start "
-                             "and end predictions are not conditioned on one another.")
-    parser.add_argument("--verbose_logging", action='store_true',
-                        help="If true, all of the warnings related to data processing will be printed. "
-                             "A number of warnings are expected for a normal SQuAD evaluation.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+    parser.add_argument(
+        "--n_best_size",
+        default=20,
+        type=int,
+        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
+    )
+    parser.add_argument(
+        "--max_answer_length",
+        default=30,
+        type=int,
+        help="The maximum length of an answer that can be generated. This is needed because the start "
+        "and end predictions are not conditioned on one another.",
+    )
+    parser.add_argument(
+        "--verbose_logging",
+        action="store_true",
+        help="If true, all of the warnings related to data processing will be printed. "
+        "A number of warnings are expected for a normal SQuAD evaluation.",
+    )

-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()

-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -486,16 +647,24 @@ def main():
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )

    # Set seed
    set_seed(args)
@@ -506,27 +675,34 @@ def main():

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )

    if args.teacher_type is not None:
        assert args.teacher_name_or_path is not None
-        assert args.alpha_ce > 0.
-        assert args.alpha_ce + args.alpha_squad > 0.
-        assert args.teacher_type != 'distilbert', "We constraint teachers not to be of type DistilBERT."
+        assert args.alpha_ce > 0.0
+        assert args.alpha_ce + args.alpha_squad > 0.0
+        assert args.teacher_type != "distilbert", "We constraint teachers not to be of type DistilBERT."
        teacher_config_class, teacher_model_class, _ = MODEL_CLASSES[args.teacher_type]
-        teacher_config = teacher_config_class.from_pretrained(args.teacher_name_or_path,
-                                                              cache_dir=args.cache_dir if args.cache_dir else None)
-        teacher = teacher_model_class.from_pretrained(args.teacher_name_or_path,
-                                                      config=teacher_config,
-                                                      cache_dir=args.cache_dir if args.cache_dir else None)
+        teacher_config = teacher_config_class.from_pretrained(
+            args.teacher_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None
+        )
+        teacher = teacher_model_class.from_pretrained(
+            args.teacher_name_or_path, config=teacher_config, cache_dir=args.cache_dir if args.cache_dir else None
+        )
        teacher.to(args.device)
    else:
        teacher = None
@@ -544,7 +720,6 @@ def main():
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher=teacher)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

-
    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
@@ -554,41 +729,44 @@ def main():
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir, cache_dir=args.cache_dir if args.cache_dir else None)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir,
-                                                    do_lower_case=args.do_lower_case,
-                                                    cache_dir=args.cache_dir if args.cache_dir else None)
+        tokenizer = tokenizer_class.from_pretrained(
+            args.output_dir, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None
+        )
        model.to(args.device)

-
    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint, cache_dir=args.cache_dir if args.cache_dir else None)
            model.to(args.device)

            # Evaluate
            result = evaluate(args, model, tokenizer, prefix=global_step)

-            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)

    logger.info("Results: {}".format(results))
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -23,68 +23,65 @@ import numpy as np
 from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer
 import logging

-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
 logger = logging.getLogger(__name__)

+
 def main():
-    parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
-    parser.add_argument('--file_path', type=str, default='data/dump.txt',
-                        help='The path to the data.')
-    parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2'])
-    parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
-                        help="The tokenizer to use.")
-    parser.add_argument('--dump_file', type=str, default='data/dump',
-                        help='The dump file prefix.')
+    parser = argparse.ArgumentParser(
+        description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids)."
+    )
+    parser.add_argument("--file_path", type=str, default="data/dump.txt", help="The path to the data.")
+    parser.add_argument("--tokenizer_type", type=str, default="bert", choices=["bert", "roberta", "gpt2"])
+    parser.add_argument("--tokenizer_name", type=str, default="bert-base-uncased", help="The tokenizer to use.")
+    parser.add_argument("--dump_file", type=str, default="data/dump", help="The dump file prefix.")
    args = parser.parse_args()

-
-    logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
-    if args.tokenizer_type == 'bert':
+    logger.info(f"Loading Tokenizer ({args.tokenizer_name})")
+    if args.tokenizer_type == "bert":
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
-        bos = tokenizer.special_tokens_map['cls_token'] # `[CLS]`
-        sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]`
-    elif args.tokenizer_type == 'roberta':
+        bos = tokenizer.special_tokens_map["cls_token"]  # `[CLS]`
+        sep = tokenizer.special_tokens_map["sep_token"]  # `[SEP]`
+    elif args.tokenizer_type == "roberta":
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
-        bos = tokenizer.special_tokens_map['cls_token'] # `<s>`
-        sep = tokenizer.special_tokens_map['sep_token'] # `</s>`
-    elif args.tokenizer_type == 'gpt2':
+        bos = tokenizer.special_tokens_map["cls_token"]  # `<s>`
+        sep = tokenizer.special_tokens_map["sep_token"]  # `</s>`
+    elif args.tokenizer_type == "gpt2":
        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
-        bos = tokenizer.special_tokens_map['bos_token'] # `<|endoftext|>`
-        sep = tokenizer.special_tokens_map['eos_token'] # `<|endoftext|>`    
+        bos = tokenizer.special_tokens_map["bos_token"]  # `<|endoftext|>`
+        sep = tokenizer.special_tokens_map["eos_token"]  # `<|endoftext|>`

-    logger.info(f'Loading text from {args.file_path}')
-    with open(args.file_path, 'r', encoding='utf8') as fp:
+    logger.info(f"Loading text from {args.file_path}")
+    with open(args.file_path, "r", encoding="utf8") as fp:
        data = fp.readlines()

-
-    logger.info(f'Start encoding')
-    logger.info(f'{len(data)} examples to process.')
+    logger.info(f"Start encoding")
+    logger.info(f"{len(data)} examples to process.")

    rslt = []
    iter = 0
    interval = 10000
    start = time.time()
    for text in data:
-        text = f'{bos} {text.strip()} {sep}'
+        text = f"{bos} {text.strip()} {sep}"
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        rslt.append(token_ids)

        iter += 1
        if iter % interval == 0:
            end = time.time()
-            logger.info(f'{iter} examples processed. - {(end-start)/interval:.2f}s/expl')
+            logger.info(f"{iter} examples processed. - {(end-start)/interval:.2f}s/expl")
            start = time.time()
-    logger.info('Finished binarization')
-    logger.info(f'{len(data)} examples processed.')
+    logger.info("Finished binarization")
+    logger.info(f"{len(data)} examples processed.")

-
-    dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle'
+    dp_file = f"{args.dump_file}.{args.tokenizer_name}.pickle"
    rslt_ = [np.uint16(d) for d in rslt]
    random.shuffle(rslt_)
-    logger.info(f'Dump to {dp_file}')
-    with open(dp_file, 'wb') as handle:
+    logger.info(f"Dump to {dp_file}")
+    with open(dp_file, "wb") as handle:
        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)


--- a/examples/distillation/scripts/extract.py
+++ b/examples/distillation/scripts/extract.py
@@ -20,70 +20,80 @@ from transformers import BertForMaskedLM, RobertaForMaskedLM, GPT2LMHeadModel
 import torch
 import argparse

-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation"
+    )
    parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"])
-    parser.add_argument("--model_name", default='roberta-large', type=str)
-    parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_roberta_048131723.pth', type=str)
-    parser.add_argument("--vocab_transform", action='store_true')
+    parser.add_argument("--model_name", default="roberta-large", type=str)
+    parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_roberta_048131723.pth", type=str)
+    parser.add_argument("--vocab_transform", action="store_true")
    args = parser.parse_args()

-
-    if args.model_type == 'roberta':
+    if args.model_type == "roberta":
        model = RobertaForMaskedLM.from_pretrained(args.model_name)
-        prefix = 'roberta'
-    elif args.model_type == 'gpt2':
+        prefix = "roberta"
+    elif args.model_type == "gpt2":
        model = GPT2LMHeadModel.from_pretrained(args.model_name)
-        prefix = 'transformer'
+        prefix = "transformer"

    state_dict = model.state_dict()
    compressed_sd = {}

    ### Embeddings ###
-    if args.model_type == 'gpt2':
-        for param_name in ['wte.weight', 'wpe.weight']:
-            compressed_sd[f'{prefix}.{param_name}'] = state_dict[f'{prefix}.{param_name}']
+    if args.model_type == "gpt2":
+        for param_name in ["wte.weight", "wpe.weight"]:
+            compressed_sd[f"{prefix}.{param_name}"] = state_dict[f"{prefix}.{param_name}"]
    else:
-        for w in ['word_embeddings', 'position_embeddings', 'token_type_embeddings']:
-            param_name = f'{prefix}.embeddings.{w}.weight'
+        for w in ["word_embeddings", "position_embeddings", "token_type_embeddings"]:
+            param_name = f"{prefix}.embeddings.{w}.weight"
            compressed_sd[param_name] = state_dict[param_name]
-        for w in ['weight', 'bias']:
-            param_name = f'{prefix}.embeddings.LayerNorm.{w}'
+        for w in ["weight", "bias"]:
+            param_name = f"{prefix}.embeddings.LayerNorm.{w}"
            compressed_sd[param_name] = state_dict[param_name]

    ### Transformer Blocks ###
    std_idx = 0
    for teacher_idx in [0, 2, 4, 7, 9, 11]:
-        if args.model_type == 'gpt2':
-            for layer in ['ln_1', 'attn.c_attn', 'attn.c_proj', 'ln_2', 'mlp.c_fc', 'mlp.c_proj']:
-                for w in ['weight', 'bias']:
-                    compressed_sd[f'{prefix}.h.{std_idx}.{layer}.{w}'] = \
-                        state_dict[f'{prefix}.h.{teacher_idx}.{layer}.{w}']
-            compressed_sd[f'{prefix}.h.{std_idx}.attn.bias'] = state_dict[f'{prefix}.h.{teacher_idx}.attn.bias']
+        if args.model_type == "gpt2":
+            for layer in ["ln_1", "attn.c_attn", "attn.c_proj", "ln_2", "mlp.c_fc", "mlp.c_proj"]:
+                for w in ["weight", "bias"]:
+                    compressed_sd[f"{prefix}.h.{std_idx}.{layer}.{w}"] = state_dict[
+                        f"{prefix}.h.{teacher_idx}.{layer}.{w}"
+                    ]
+            compressed_sd[f"{prefix}.h.{std_idx}.attn.bias"] = state_dict[f"{prefix}.h.{teacher_idx}.attn.bias"]
        else:
-            for layer in ['attention.self.query', 'attention.self.key', 'attention.self.value',
-                        'attention.output.dense', 'attention.output.LayerNorm',
-                        'intermediate.dense', 'output.dense', 'output.LayerNorm']:
-                for w in ['weight', 'bias']:
-                    compressed_sd[f'{prefix}.encoder.layer.{std_idx}.{layer}.{w}'] = \
-                        state_dict[f'{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}']
+            for layer in [
+                "attention.self.query",
+                "attention.self.key",
+                "attention.self.value",
+                "attention.output.dense",
+                "attention.output.LayerNorm",
+                "intermediate.dense",
+                "output.dense",
+                "output.LayerNorm",
+            ]:
+                for w in ["weight", "bias"]:
+                    compressed_sd[f"{prefix}.encoder.layer.{std_idx}.{layer}.{w}"] = state_dict[
+                        f"{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}"
+                    ]
        std_idx += 1

    ### Language Modeling Head ###s
-    if args.model_type == 'roberta':
-        for layer in ['lm_head.decoder.weight', 'lm_head.bias']:
-            compressed_sd[f'{layer}'] = state_dict[f'{layer}']
+    if args.model_type == "roberta":
+        for layer in ["lm_head.decoder.weight", "lm_head.bias"]:
+            compressed_sd[f"{layer}"] = state_dict[f"{layer}"]
        if args.vocab_transform:
-            for w in ['weight', 'bias']:
-                compressed_sd[f'lm_head.dense.{w}'] = state_dict[f'lm_head.dense.{w}']
-                compressed_sd[f'lm_head.layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
-    elif args.model_type == 'gpt2':
-        for w in ['weight', 'bias']:
-            compressed_sd[f'{prefix}.ln_f.{w}'] = state_dict[f'{prefix}.ln_f.{w}']
-        compressed_sd[f'lm_head.weight'] = state_dict[f'lm_head.weight']
+            for w in ["weight", "bias"]:
+                compressed_sd[f"lm_head.dense.{w}"] = state_dict[f"lm_head.dense.{w}"]
+                compressed_sd[f"lm_head.layer_norm.{w}"] = state_dict[f"lm_head.layer_norm.{w}"]
+    elif args.model_type == "gpt2":
+        for w in ["weight", "bias"]:
+            compressed_sd[f"{prefix}.ln_f.{w}"] = state_dict[f"{prefix}.ln_f.{w}"]
+        compressed_sd[f"lm_head.weight"] = state_dict[f"lm_head.weight"]

-    print(f'N layers selected for distillation: {std_idx}')
-    print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
+    print(f"N layers selected for distillation: {std_idx}")
+    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")

-    print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
+    print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
    torch.save(compressed_sd, args.dump_checkpoint)
--- a/examples/distillation/scripts/extract_distilbert.py
+++ b/examples/distillation/scripts/extract_distilbert.py
@@ -20,63 +20,70 @@ from transformers import BertForMaskedLM, RobertaForMaskedLM
 import torch
 import argparse

-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation"
+    )
    parser.add_argument("--model_type", default="bert", choices=["bert"])
-    parser.add_argument("--model_name", default='bert-base-uncased', type=str)
-    parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str)
-    parser.add_argument("--vocab_transform", action='store_true')
+    parser.add_argument("--model_name", default="bert-base-uncased", type=str)
+    parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_bert-base-uncased_0247911.pth", type=str)
+    parser.add_argument("--vocab_transform", action="store_true")
    args = parser.parse_args()

-
-    if args.model_type == 'bert':
+    if args.model_type == "bert":
        model = BertForMaskedLM.from_pretrained(args.model_name)
-        prefix = 'bert'
+        prefix = "bert"
    else:
        raise ValueError(f'args.model_type should be "bert".')

    state_dict = model.state_dict()
    compressed_sd = {}

-    for w in ['word_embeddings', 'position_embeddings']:
-        compressed_sd[f'distilbert.embeddings.{w}.weight'] = \
-            state_dict[f'{prefix}.embeddings.{w}.weight']
-    for w in ['weight', 'bias']:
-        compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \
-            state_dict[f'{prefix}.embeddings.LayerNorm.{w}']
+    for w in ["word_embeddings", "position_embeddings"]:
+        compressed_sd[f"distilbert.embeddings.{w}.weight"] = state_dict[f"{prefix}.embeddings.{w}.weight"]
+    for w in ["weight", "bias"]:
+        compressed_sd[f"distilbert.embeddings.LayerNorm.{w}"] = state_dict[f"{prefix}.embeddings.LayerNorm.{w}"]

    std_idx = 0
    for teacher_idx in [0, 2, 4, 7, 9, 11]:
-        for w in ['weight', 'bias']:
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}']
+        for w in ["weight", "bias"]:
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}"
+            ]

-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}']
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}"
+            ]

-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}"
+            ]
        std_idx += 1

-    compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
-    compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
+    compressed_sd[f"vocab_projector.weight"] = state_dict[f"cls.predictions.decoder.weight"]
+    compressed_sd[f"vocab_projector.bias"] = state_dict[f"cls.predictions.bias"]
    if args.vocab_transform:
-        for w in ['weight', 'bias']:
-            compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
-            compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
+        for w in ["weight", "bias"]:
+            compressed_sd[f"vocab_transform.{w}"] = state_dict[f"cls.predictions.transform.dense.{w}"]
+            compressed_sd[f"vocab_layer_norm.{w}"] = state_dict[f"cls.predictions.transform.LayerNorm.{w}"]

-    print(f'N layers selected for distillation: {std_idx}')
-    print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
+    print(f"N layers selected for distillation: {std_idx}")
+    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")

-    print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
+    print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
    torch.save(compressed_sd, args.dump_checkpoint)
--- a/examples/distillation/scripts/token_counts.py
+++ b/examples/distillation/scripts/token_counts.py
@@ -20,32 +20,36 @@ import argparse
 import pickle
 import logging

-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
 logger = logging.getLogger(__name__)

-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)")
-    parser.add_argument("--data_file", type=str, default="data/dump.bert-base-uncased.pickle",
-                        help="The binarized dataset.")
-    parser.add_argument("--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle",
-                        help="The dump file.")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)"
+    )
+    parser.add_argument(
+        "--data_file", type=str, default="data/dump.bert-base-uncased.pickle", help="The binarized dataset."
+    )
+    parser.add_argument(
+        "--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", help="The dump file."
+    )
    parser.add_argument("--vocab_size", default=30522, type=int)
    args = parser.parse_args()

-    logger.info(f'Loading data from {args.data_file}')
-    with open(args.data_file, 'rb') as fp:
+    logger.info(f"Loading data from {args.data_file}")
+    with open(args.data_file, "rb") as fp:
        data = pickle.load(fp)

-    logger.info('Counting occurences for MLM.')
+    logger.info("Counting occurences for MLM.")
    counter = Counter()
    for tk_ids in data:
        counter.update(tk_ids)
-    counts = [0]*args.vocab_size
+    counts = [0] * args.vocab_size
    for k, v in counter.items():
        counts[k] = v

-    logger.info(f'Dump to {args.token_counts_dump}')
-    with open(args.token_counts_dump, 'wb') as handle:
+    logger.info(f"Dump to {args.token_counts_dump}")
+    with open(args.token_counts_dump, "wb") as handle:
        pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL)
--- a/examples/distillation/train.py
+++ b/examples/distillation/train.py
@@ -35,166 +35,200 @@ from lm_seqs_dataset import LmSeqsDataset


 MODEL_CLASSES = {
-    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
-    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
-    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
-    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)
+    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
+    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
+    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
+    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
 }

+
 def sanity_checks(args):
    """
    A bunch of args sanity checks to perform even starting...
    """
-    assert (args.mlm and args.alpha_mlm > 0.) or (not args.mlm and args.alpha_mlm == 0.)
-    assert (args.alpha_mlm > 0. and args.alpha_clm == 0.) or (args.alpha_mlm == 0. and args.alpha_clm > 0.)
+    assert (args.mlm and args.alpha_mlm > 0.0) or (not args.mlm and args.alpha_mlm == 0.0)
+    assert (args.alpha_mlm > 0.0 and args.alpha_clm == 0.0) or (args.alpha_mlm == 0.0 and args.alpha_clm > 0.0)
    if args.mlm:
        assert os.path.isfile(args.token_counts)
-        assert (args.student_type in ['roberta', 'distilbert']) and (args.teacher_type in ['roberta', 'bert'])
+        assert (args.student_type in ["roberta", "distilbert"]) and (args.teacher_type in ["roberta", "bert"])
    else:
-        assert (args.student_type in ['gpt2']) and (args.teacher_type in ['gpt2'])
+        assert (args.student_type in ["gpt2"]) and (args.teacher_type in ["gpt2"])

-    assert args.teacher_type == args.student_type or (args.student_type=='distilbert' and args.teacher_type=='bert')
+    assert args.teacher_type == args.student_type or (
+        args.student_type == "distilbert" and args.teacher_type == "bert"
+    )
    assert os.path.isfile(args.student_config)
    if args.student_pretrained_weights is not None:
        assert os.path.isfile(args.student_pretrained_weights)

-    if args.freeze_token_type_embds: assert args.student_type in ['roberta']
+    if args.freeze_token_type_embds:
+        assert args.student_type in ["roberta"]
+
+    assert args.alpha_ce >= 0.0
+    assert args.alpha_mlm >= 0.0
+    assert args.alpha_clm >= 0.0
+    assert args.alpha_mse >= 0.0
+    assert args.alpha_cos >= 0.0
+    assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.0

-    assert args.alpha_ce >= 0.
-    assert args.alpha_mlm >= 0.
-    assert args.alpha_clm >= 0.
-    assert args.alpha_mse >= 0.
-    assert args.alpha_cos >= 0.
-    assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.

 def freeze_pos_embeddings(student, args):
-    if args.student_type == 'roberta':
+    if args.student_type == "roberta":
        student.roberta.embeddings.position_embeddings.weight.requires_grad = False
-    elif args.student_type == 'gpt2':
+    elif args.student_type == "gpt2":
        student.transformer.wpe.weight.requires_grad = False

+
 def freeze_token_type_embeddings(student, args):
-    if args.student_type == 'roberta':
+    if args.student_type == "roberta":
        student.roberta.embeddings.token_type_embeddings.weight.requires_grad = False

+
 def main():
    parser = argparse.ArgumentParser(description="Training")
-    parser.add_argument("--force", action='store_true',
-                        help="Overwrite dump_path if it already exists.")
+    parser.add_argument("--force", action="store_true", help="Overwrite dump_path if it already exists.")

-    parser.add_argument("--dump_path", type=str, required=True,
-                        help="The output directory (log, checkpoints, parameters, etc.)")
-    parser.add_argument("--data_file", type=str, required=True,
-                        help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.")
+    parser.add_argument(
+        "--dump_path", type=str, required=True, help="The output directory (log, checkpoints, parameters, etc.)"
+    )
+    parser.add_argument(
+        "--data_file",
+        type=str,
+        required=True,
+        help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.",
+    )

-    parser.add_argument("--student_type", type=str, choices=["distilbert", "roberta", "gpt2"], required=True,
-                        help="The student type (DistilBERT, RoBERTa).")
-    parser.add_argument("--student_config", type=str, required=True,
-                        help="Path to the student configuration.")
-    parser.add_argument("--student_pretrained_weights", default=None, type=str,
-                        help="Load student initialization checkpoint.")
+    parser.add_argument(
+        "--student_type",
+        type=str,
+        choices=["distilbert", "roberta", "gpt2"],
+        required=True,
+        help="The student type (DistilBERT, RoBERTa).",
+    )
+    parser.add_argument("--student_config", type=str, required=True, help="Path to the student configuration.")
+    parser.add_argument(
+        "--student_pretrained_weights", default=None, type=str, help="Load student initialization checkpoint."
+    )

-    parser.add_argument("--teacher_type", choices=["bert", "roberta", "gpt2"], required=True,
-                        help="Teacher type (BERT, RoBERTa).")
-    parser.add_argument("--teacher_name", type=str, required=True,
-                        help="The teacher model.")
+    parser.add_argument(
+        "--teacher_type", choices=["bert", "roberta", "gpt2"], required=True, help="Teacher type (BERT, RoBERTa)."
+    )
+    parser.add_argument("--teacher_name", type=str, required=True, help="The teacher model.")

-    parser.add_argument("--temperature", default=2., type=float,
-                        help="Temperature for the softmax temperature.")
-    parser.add_argument("--alpha_ce", default=0.5, type=float,
-                        help="Linear weight for the distillation loss. Must be >=0.")
-    parser.add_argument("--alpha_mlm", default=0.0, type=float,
-                        help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.")
-    parser.add_argument("--alpha_clm", default=0.5, type=float,
-                        help="Linear weight for the CLM loss. Must be >=0.")
-    parser.add_argument("--alpha_mse", default=0.0, type=float,
-                        help="Linear weight of the MSE loss. Must be >=0.")
-    parser.add_argument("--alpha_cos", default=0.0, type=float,
-                        help="Linear weight of the cosine embedding loss. Must be >=0.")
+    parser.add_argument("--temperature", default=2.0, type=float, help="Temperature for the softmax temperature.")
+    parser.add_argument(
+        "--alpha_ce", default=0.5, type=float, help="Linear weight for the distillation loss. Must be >=0."
+    )
+    parser.add_argument(
+        "--alpha_mlm",
+        default=0.0,
+        type=float,
+        help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.",
+    )
+    parser.add_argument("--alpha_clm", default=0.5, type=float, help="Linear weight for the CLM loss. Must be >=0.")
+    parser.add_argument("--alpha_mse", default=0.0, type=float, help="Linear weight of the MSE loss. Must be >=0.")
+    parser.add_argument(
+        "--alpha_cos", default=0.0, type=float, help="Linear weight of the cosine embedding loss. Must be >=0."
+    )

-    parser.add_argument("--mlm", action="store_true",
-                        help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM.")
-    parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
-                        help="Proportion of tokens for which we need to make a prediction.")
-    parser.add_argument("--word_mask", default=0.8, type=float,
-                        help="Proportion of tokens to mask out.")
-    parser.add_argument("--word_keep", default=0.1, type=float,
-                        help="Proportion of tokens to keep.")
-    parser.add_argument("--word_rand", default=0.1, type=float,
-                        help="Proportion of tokens to randomly replace.")
-    parser.add_argument("--mlm_smoothing", default=0.7, type=float,
-                        help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).")
-    parser.add_argument("--token_counts", type=str,
-                        help="The token counts in the data_file for MLM.")
+    parser.add_argument(
+        "--mlm", action="store_true", help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM."
+    )
+    parser.add_argument(
+        "--mlm_mask_prop",
+        default=0.15,
+        type=float,
+        help="Proportion of tokens for which we need to make a prediction.",
+    )
+    parser.add_argument("--word_mask", default=0.8, type=float, help="Proportion of tokens to mask out.")
+    parser.add_argument("--word_keep", default=0.1, type=float, help="Proportion of tokens to keep.")
+    parser.add_argument("--word_rand", default=0.1, type=float, help="Proportion of tokens to randomly replace.")
+    parser.add_argument(
+        "--mlm_smoothing",
+        default=0.7,
+        type=float,
+        help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).",
+    )
+    parser.add_argument("--token_counts", type=str, help="The token counts in the data_file for MLM.")

-    parser.add_argument("--restrict_ce_to_mask", action='store_true',
-                        help="If true, compute the distilation loss only the [MLM] prediction distribution.")
-    parser.add_argument("--freeze_pos_embs", action="store_true",
-                        help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.")
-    parser.add_argument("--freeze_token_type_embds", action="store_true",
-                        help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.")
+    parser.add_argument(
+        "--restrict_ce_to_mask",
+        action="store_true",
+        help="If true, compute the distilation loss only the [MLM] prediction distribution.",
+    )
+    parser.add_argument(
+        "--freeze_pos_embs",
+        action="store_true",
+        help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.",
+    )
+    parser.add_argument(
+        "--freeze_token_type_embds",
+        action="store_true",
+        help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.",
+    )

-    parser.add_argument("--n_epoch", type=int, default=3,
-                        help="Number of pass on the whole dataset.")
-    parser.add_argument("--batch_size", type=int, default=5,
-                        help="Batch size (for each process).")
-    parser.add_argument("--group_by_size", action='store_false',
-                        help="If true, group sequences that have similar length into the same batch. Default is true.")
+    parser.add_argument("--n_epoch", type=int, default=3, help="Number of pass on the whole dataset.")
+    parser.add_argument("--batch_size", type=int, default=5, help="Batch size (for each process).")
+    parser.add_argument(
+        "--group_by_size",
+        action="store_false",
+        help="If true, group sequences that have similar length into the same batch. Default is true.",
+    )

-    parser.add_argument("--gradient_accumulation_steps", type=int, default=50,
-                        help="Gradient accumulation for larger training batches.")
-    parser.add_argument("--warmup_prop", default=0.05, type=float,
-                        help="Linear warmup proportion.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--learning_rate", default=5e-4, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=5.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--initializer_range", default=0.02, type=float,
-                        help="Random initialization range.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=50,
+        help="Gradient accumulation for larger training batches.",
+    )
+    parser.add_argument("--warmup_prop", default=0.05, type=float, help="Linear warmup proportion.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--learning_rate", default=5e-4, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=5.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--initializer_range", default=0.02, type=float, help="Random initialization range.")

-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--n_gpu", type=int, default=1,
-                        help="Number of GPUs in the node.")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="Distributed training - Local rank")
-    parser.add_argument("--seed", type=int, default=56,
-                        help="Random seed")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs in the node.")
+    parser.add_argument("--local_rank", type=int, default=-1, help="Distributed training - Local rank")
+    parser.add_argument("--seed", type=int, default=56, help="Random seed")

-    parser.add_argument("--log_interval", type=int, default=500,
-                        help="Tensorboard logging interval.")
-    parser.add_argument("--checkpoint_interval", type=int, default=4000,
-                        help="Checkpoint interval.")
+    parser.add_argument("--log_interval", type=int, default=500, help="Tensorboard logging interval.")
+    parser.add_argument("--checkpoint_interval", type=int, default=4000, help="Checkpoint interval.")
    args = parser.parse_args()
    sanity_checks(args)

-
    ## ARGS ##
    init_gpu_params(args)
    set_seed(args)
    if args.is_master:
        if os.path.exists(args.dump_path):
            if not args.force:
-                raise ValueError(f'Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it'
-                                   'Use `--force` if you want to overwrite it')
+                raise ValueError(
+                    f"Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it"
+                    "Use `--force` if you want to overwrite it"
+                )
            else:
                shutil.rmtree(args.dump_path)

        if not os.path.exists(args.dump_path):
            os.makedirs(args.dump_path)
-        logger.info(f'Experiment will be dumped and logged in {args.dump_path}')
-
+        logger.info(f"Experiment will be dumped and logged in {args.dump_path}")

        ### SAVE PARAMS ###
-        logger.info(f'Param: {args}')
-        with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f:
+        logger.info(f"Param: {args}")
+        with open(os.path.join(args.dump_path, "parameters.json"), "w") as f:
            json.dump(vars(args), f, indent=4)
        git_log(args.dump_path)

@@ -207,58 +241,50 @@ def main():
    for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
        idx = tokenizer.all_special_tokens.index(tok_symbol)
        special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
-    logger.info(f'Special tokens {special_tok_ids}')
+    logger.info(f"Special tokens {special_tok_ids}")
    args.special_tok_ids = special_tok_ids
    args.max_model_input_size = tokenizer.max_model_input_sizes[args.teacher_name]

-
    ## DATA LOADER ##
-    logger.info(f'Loading data from {args.data_file}')
-    with open(args.data_file, 'rb') as fp:
+    logger.info(f"Loading data from {args.data_file}")
+    with open(args.data_file, "rb") as fp:
        data = pickle.load(fp)

-
    if args.mlm:
-        logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
-        with open(args.token_counts, 'rb') as fp:
+        logger.info(f"Loading token counts from {args.token_counts} (already pre-computed)")
+        with open(args.token_counts, "rb") as fp:
            counts = pickle.load(fp)
-        
+
        token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
        for idx in special_tok_ids.values():
-            token_probs[idx] = 0.  # do not predict special tokens
+            token_probs[idx] = 0.0  # do not predict special tokens
        token_probs = torch.from_numpy(token_probs)
    else:
        token_probs = None

-
    train_lm_seq_dataset = LmSeqsDataset(params=args, data=data)
-    logger.info(f'Data loader created.')
-
+    logger.info(f"Data loader created.")

    ## STUDENT ##
-    logger.info(f'Loading student config from {args.student_config}')
+    logger.info(f"Loading student config from {args.student_config}")
    stu_architecture_config = student_config_class.from_pretrained(args.student_config)
    stu_architecture_config.output_hidden_states = True

    if args.student_pretrained_weights is not None:
-        logger.info(f'Loading pretrained weights from {args.student_pretrained_weights}')
-        student = student_model_class.from_pretrained(args.student_pretrained_weights,
-                                                      config=stu_architecture_config)
+        logger.info(f"Loading pretrained weights from {args.student_pretrained_weights}")
+        student = student_model_class.from_pretrained(args.student_pretrained_weights, config=stu_architecture_config)
    else:
        student = student_model_class(stu_architecture_config)

-
    if args.n_gpu > 0:
-        student.to(f'cuda:{args.local_rank}')
-    logger.info(f'Student loaded.')
-
+        student.to(f"cuda:{args.local_rank}")
+    logger.info(f"Student loaded.")

    ## TEACHER ##
    teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True)
    if args.n_gpu > 0:
-        teacher.to(f'cuda:{args.local_rank}')
-    logger.info(f'Teacher loaded from {args.teacher_name}.')
-
+        teacher.to(f"cuda:{args.local_rank}")
+    logger.info(f"Teacher loaded from {args.teacher_name}.")

    ## FREEZING ##
    if args.freeze_pos_embs:
@@ -266,7 +292,6 @@ def main():
    if args.freeze_token_type_embds:
        freeze_token_type_embeddings(student, args)

-
    ## SANITY CHECKS ##
    assert student.config.vocab_size == teacher.config.vocab_size
    assert student.config.hidden_size == teacher.config.hidden_size
@@ -274,14 +299,11 @@ def main():
    if args.mlm:
        assert token_probs.size(0) == stu_architecture_config.vocab_size

-
    ## DISTILLER ##
    torch.cuda.empty_cache()
-    distiller = Distiller(params=args,
-                          dataset=train_lm_seq_dataset,
-                          token_probs=token_probs,
-                          student=student,
-                          teacher=teacher)
+    distiller = Distiller(
+        params=args, dataset=train_lm_seq_dataset, token_probs=token_probs, student=student, teacher=teacher
+    )
    distiller.train()
    logger.info("Let's go get some drinks.")

--- a/examples/distillation/utils.py
+++ b/examples/distillation/utils.py
@@ -23,9 +23,12 @@ import torch
 import numpy as np

 import logging
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d -  %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d -  %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
+)
 logger = logging.getLogger(__name__)


@@ -35,12 +38,12 @@ def git_log(folder_path: str):
    """
    repo = git.Repo(search_parent_directories=True)
    repo_infos = {
-        'repo_id': str(repo),
-        'repo_sha': str(repo.head.object.hexsha),
-        'repo_branch': str(repo.active_branch)
+        "repo_id": str(repo),
+        "repo_sha": str(repo.head.object.hexsha),
+        "repo_branch": str(repo.active_branch),
    }

-    with open(os.path.join(folder_path, 'git_log.json'), 'w') as f:
+    with open(os.path.join(folder_path, "git_log.json"), "w") as f:
        json.dump(repo_infos, f, indent=4)


@@ -57,21 +60,21 @@ def init_gpu_params(params):

    assert torch.cuda.is_available()

-    logger.info('Initializing GPUs')
+    logger.info("Initializing GPUs")
    if params.n_gpu > 1:
        assert params.local_rank != -1

-        params.world_size = int(os.environ['WORLD_SIZE'])
-        params.n_gpu_per_node = int(os.environ['N_GPU_NODE'])
-        params.global_rank = int(os.environ['RANK'])
+        params.world_size = int(os.environ["WORLD_SIZE"])
+        params.n_gpu_per_node = int(os.environ["N_GPU_NODE"])
+        params.global_rank = int(os.environ["RANK"])

        # number of nodes / node ID
        params.n_nodes = params.world_size // params.n_gpu_per_node
        params.node_id = params.global_rank // params.n_gpu_per_node
        params.multi_gpu = True

-        assert params.n_nodes == int(os.environ['N_NODES'])
-        assert params.node_id == int(os.environ['NODE_RANK'])
+        assert params.n_nodes == int(os.environ["N_NODES"])
+        assert params.node_id == int(os.environ["NODE_RANK"])

    # local job (single GPU)
    else:
@@ -114,8 +117,7 @@ def init_gpu_params(params):
    if params.multi_gpu:
        logger.info("Initializing PyTorch distributed")
        torch.distributed.init_process_group(
-            init_method='env://',
-            backend='nccl',
+            init_method="env://", backend="nccl",
        )


--- a/examples/mm-imdb/run_mmimdb.py
+++ b/examples/mm-imdb/run_mmimdb.py
@@ -40,29 +40,49 @@ from tqdm import tqdm, trange

 from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_mmimdb_labels, get_image_transforms

-from transformers import (WEIGHTS_NAME,
-                          BertConfig, BertModel, BertTokenizer,
-                          RobertaConfig, RobertaModel, RobertaTokenizer,
-                          XLMConfig, XLMModel, XLMTokenizer,
-                          XLNetConfig, XLNetModel, XLNetTokenizer,
-                          DistilBertConfig, DistilBertModel, DistilBertTokenizer,
-                          AlbertConfig, AlbertModel, AlbertTokenizer,
-                          MMBTForClassification, MMBTConfig)
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertModel,
+    BertTokenizer,
+    RobertaConfig,
+    RobertaModel,
+    RobertaTokenizer,
+    XLMConfig,
+    XLMModel,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetModel,
+    XLNetTokenizer,
+    DistilBertConfig,
+    DistilBertModel,
+    DistilBertTokenizer,
+    AlbertConfig,
+    AlbertModel,
+    AlbertTokenizer,
+    MMBTForClassification,
+    MMBTConfig,
+)

 from transformers import AdamW, get_linear_schedule_with_warmup

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig,
-                                                                                RobertaConfig, DistilBertConfig)), ())
+ALL_MODELS = sum(
+    (
+        tuple(conf.pretrained_config_archive_map.keys())
+        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
+    ),
+    (),
+)

 MODEL_CLASSES = {
-    'bert': (BertConfig, BertModel, BertTokenizer),
-    'xlnet': (XLNetConfig, XLNetModel, XLNetTokenizer),
-    'xlm': (XLMConfig, XLMModel, XLMTokenizer),
-    'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
-    'albert': (AlbertConfig, AlbertModel, AlbertTokenizer)
+    "bert": (BertConfig, BertModel, BertTokenizer),
+    "xlnet": (XLNetConfig, XLNetModel, XLNetTokenizer),
+    "xlm": (XLMConfig, XLMModel, XLMTokenizer),
+    "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
+    "albert": (AlbertConfig, AlbertModel, AlbertTokenizer),
 }


@@ -81,10 +101,13 @@ def train(args, train_dataset, model, tokenizer, criterion):

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
-                                  batch_size=args.train_batch_size,
-                                  collate_fn=collate_fn,
-                                  num_workers=args.num_workers)
+    train_dataloader = DataLoader(
+        train_dataset,
+        sampler=train_sampler,
+        batch_size=args.train_batch_size,
+        collate_fn=collate_fn,
+        num_workers=args.num_workers,
+    )

    if args.max_steps > 0:
        t_total = args.max_steps
@@ -93,14 +116,19 @@ def train(args, train_dataset, model, tokenizer, criterion):
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
    if args.fp16:
        try:
            from apex import amp
@@ -114,17 +142,21 @@ def train(args, train_dataset, model, tokenizer, criterion):

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

@@ -140,17 +172,19 @@ def train(args, train_dataset, model, tokenizer, criterion):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            labels = batch[5]
-            inputs = {'input_ids':      batch[0],
-                      'input_modal':    batch[2],
-                      'attention_mask': batch[1],
-                      'modal_start_tokens': batch[3],
-                      'modal_end_tokens': batch[4]}
+            inputs = {
+                "input_ids": batch[0],
+                "input_modal": batch[2],
+                "attention_mask": batch[1],
+                "modal_start_tokens": batch[3],
+                "modal_end_tokens": batch[4],
+            }
            outputs = model(**inputs)
            logits = outputs[0]  # model outputs are always tuple in transformers (see doc)
            loss = criterion(logits, labels)

            if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

@@ -174,30 +208,34 @@ def train(args, train_dataset, model, tokenizer, criterion):

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, criterion)
                        for key, value in results.items():
-                            eval_key = 'eval_{}'.format(key)
+                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
-                    logs['learning_rate'] = learning_rate_scalar
-                    logs['loss'] = loss_scalar
+                    logs["learning_rate"] = learning_rate_scalar
+                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
-                    print(json.dumps({**logs, **{'step': global_step}}))
+                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                    torch.save(model_to_save.state_dict(), os.path.join(output_dir, WEIGHTS_NAME))
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
@@ -209,13 +247,13 @@ def train(args, train_dataset, model, tokenizer, criterion):

        if args.local_rank == -1:
            results = evaluate(args, model, tokenizer, criterion)
-            if results['micro_f1'] > best_f1:
-                best_f1 = results['micro_f1']
+            if results["micro_f1"] > best_f1:
+                best_f1 = results["micro_f1"]
                n_no_improve = 0
            else:
                n_no_improve += 1

-            if  n_no_improve > args.patience:
+            if n_no_improve > args.patience:
                train_iterator.close()
                break

@@ -236,7 +274,9 @@ def evaluate(args, model, tokenizer, criterion, prefix=""):
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
-    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn)
+    eval_dataloader = DataLoader(
+        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn
+    )

    # multi-gpu eval
    if args.n_gpu > 1:
@@ -257,11 +297,13 @@ def evaluate(args, model, tokenizer, criterion, prefix=""):
        with torch.no_grad():
            batch = tuple(t.to(args.device) for t in batch)
            labels = batch[5]
-            inputs = {'input_ids': batch[0],
-                      'input_modal': batch[2],
-                      'attention_mask': batch[1],
-                      'modal_start_tokens': batch[3],
-                      'modal_end_tokens': batch[4]}
+            inputs = {
+                "input_ids": batch[0],
+                "input_modal": batch[2],
+                "attention_mask": batch[1],
+                "modal_start_tokens": batch[3],
+                "modal_end_tokens": batch[4],
+            }
            outputs = model(**inputs)
            logits = outputs[0]  # model outputs are always tuple in transformers (see doc)
            tmp_eval_loss = criterion(logits, labels)
@@ -278,7 +320,7 @@ def evaluate(args, model, tokenizer, criterion, prefix=""):
    result = {
        "loss": eval_loss,
        "macro_f1": f1_score(out_label_ids, preds, average="macro"),
-        "micro_f1": f1_score(out_label_ids, preds, average="micro")
+        "micro_f1": f1_score(out_label_ids, preds, average="micro"),
    }

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
@@ -303,94 +345,147 @@ def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .jsonl files for MMIMDB.")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .jsonl files for MMIMDB.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )

    ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences longer "
-                             "than this will be truncated, sequences shorter will be padded.")
-    parser.add_argument("--num_image_embeds", default=1, type=int,
-                        help="Number of Image Embeddings from the Image Encoder")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument(
+        "--num_image_embeds", default=1, type=int, help="Number of Image Embeddings from the Image Encoder"
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )

-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--patience", default=5, type=int,
-                        help="Patience for Early Stopping.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument("--patience", default=5, type=int, help="Patience for Early Stopping.")
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Avoid using CUDA when available")
-    parser.add_argument('--num_workers', type=int, default=8,
-                        help="number of worker threads for dataloading")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument("--num_workers", type=int, default=8, help="number of worker threads for dataloading")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -402,17 +497,25 @@ def main():
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1

    args.device = device

    # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )

    # Set seed
    set_seed(args)
@@ -426,13 +529,17 @@ def main():
    num_labels = len(labels)
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    transformer_config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    transformer = model_class.from_pretrained(args.model_name_or_path,
-                                              config=transformer_config,
-                                              cache_dir=args.cache_dir if args.cache_dir else None)
+    transformer_config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    transformer = model_class.from_pretrained(
+        args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir if args.cache_dir else None
+    )
    img_encoder = ImageEncoder(args)
    config = MMBTConfig(transformer_config, num_labels=num_labels)
    model = MMBTForClassification(config, transformer, img_encoder)
@@ -449,12 +556,13 @@ def main():
        train_dataset = load_examples(args, tokenizer, evaluate=False)
        label_frequences = train_dataset.get_label_frequencies()
        label_frequences = [label_frequences[l] for l in labels]
-        label_weights = (torch.tensor(label_frequences, device=args.device, dtype=torch.float) / len(train_dataset)) ** -1
+        label_weights = (
+            torch.tensor(label_frequences, device=args.device, dtype=torch.float) / len(train_dataset)
+        ) ** -1
        criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, criterion)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

-
    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
@@ -464,12 +572,14 @@ def main():
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
        torch.save(model_to_save.state_dict(), os.path.join(args.output_dir, WEIGHTS_NAME))
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = MMBTForClassification(config, transformer, img_encoder)
@@ -477,24 +587,25 @@ def main():
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

-
    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
            model = MMBTForClassification(config, transformer, img_encoder)
            model.load_state_dict(torch.load(checkpoint))
            model.to(args.device)
            result = evaluate(args, model, tokenizer, criterion, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
--- a/examples/mm-imdb/utils_mmimdb.py
+++ b/examples/mm-imdb/utils_mmimdb.py
@@ -25,17 +25,7 @@ import torchvision
 import torchvision.transforms as transforms
 from torch.utils.data import Dataset

-POOLING_BREAKDOWN = {
-    1: (1, 1),
-    2: (2, 1),
-    3: (3, 1),
-    4: (2, 2),
-    5: (5, 1),
-    6: (3, 2),
-    7: (7, 1),
-    8: (4, 2),
-    9: (3, 3)
-}
+POOLING_BREAKDOWN = {1: (1, 1), 2: (2, 1), 3: (3, 1), 4: (2, 2), 5: (5, 1), 6: (3, 2), 7: (7, 1), 8: (4, 2), 9: (3, 3)}


 class ImageEncoder(nn.Module):
@@ -54,7 +44,6 @@ class ImageEncoder(nn.Module):
        return out  # BxNx2048


-
 class JsonlDataset(Dataset):
    def __init__(self, data_path, tokenizer, transforms, labels, max_seq_length):
        self.data = [json.loads(l) for l in open(data_path)]
@@ -72,7 +61,7 @@ class JsonlDataset(Dataset):
    def __getitem__(self, index):
        sentence = torch.LongTensor(self.tokenizer.encode(self.data[index]["text"], add_special_tokens=True))
        start_token, sentence, end_token = sentence[0], sentence[1:-1], sentence[-1]
-        sentence = sentence[:self.max_seq_length]
+        sentence = sentence[: self.max_seq_length]

        label = torch.zeros(self.n_classes)
        label[[self.labels.index(tgt) for tgt in self.data[index]["label"]]] = 1
@@ -80,8 +69,13 @@ class JsonlDataset(Dataset):
        image = Image.open(os.path.join(self.data_dir, self.data[index]["img"])).convert("RGB")
        image = self.transforms(image)

-        return {"image_start_token": start_token, "image_end_token": end_token,
-                "sentence": sentence, "image": image, "label": label}
+        return {
+            "image_start_token": start_token,
+            "image_end_token": end_token,
+            "sentence": sentence,
+            "image": image,
+            "label": label,
+        }

    def get_label_frequencies(self):
        label_freqs = Counter()
@@ -110,10 +104,31 @@ def collate_fn(batch):


 def get_mmimdb_labels():
-    return ['Crime', 'Drama', 'Thriller', 'Action', 'Comedy', 'Romance',
-            'Documentary', 'Short', 'Mystery', 'History', 'Family', 'Adventure',
-            'Fantasy', 'Sci-Fi', 'Western', 'Horror', 'Sport', 'War', 'Music',
-            'Musical', 'Animation', 'Biography', 'Film-Noir']
+    return [
+        "Crime",
+        "Drama",
+        "Thriller",
+        "Action",
+        "Comedy",
+        "Romance",
+        "Documentary",
+        "Short",
+        "Mystery",
+        "History",
+        "Family",
+        "Adventure",
+        "Fantasy",
+        "Sci-Fi",
+        "Western",
+        "Horror",
+        "Sport",
+        "War",
+        "Music",
+        "Musical",
+        "Animation",
+        "Biography",
+        "Film-Noir",
+    ]


 def get_image_transforms():
@@ -122,9 +137,6 @@ def get_image_transforms():
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
-            transforms.Normalize(
-                mean=[0.46777044, 0.44531429, 0.40661017],
-                std=[0.12221994, 0.12145835, 0.14380469],
-            ),
+            transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],),
        ]
    )
--- a/examples/pplm/pplm_classification_head.py
+++ b/examples/pplm/pplm_classification_head.py
@@ -1,5 +1,6 @@
 import torch

+
 class ClassificationHead(torch.nn.Module):
    """Classification Head for  transformer encoders"""

--- a/examples/pplm/run_pplm.py
+++ b/examples/pplm/run_pplm.py
@@ -1,19 +1,19 @@
 #! /usr/bin/env python3
 # coding=utf-8

-#Copyright (c) 2019 Uber Technologies, Inc.
+# Copyright (c) 2019 Uber Technologies, Inc.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 """
 Example command with bag of words:
@@ -46,13 +46,13 @@ SMALL_CONST = 1e-15
 BIG_CONST = 1e10

 BAG_OF_WORDS_ARCHIVE_MAP = {
-    'legal': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt",
-    'military': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt",
-    'politics': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt",
-    'religion': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt",
-    'science': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt",
-    'space': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt",
-    'technology': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/technology.txt",
+    "legal": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt",
+    "military": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt",
+    "politics": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt",
+    "religion": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt",
+    "science": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt",
+    "space": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt",
+    "technology": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/technology.txt",
 }

 DISCRIMINATOR_MODELS_PARAMS = {
@@ -75,10 +75,10 @@ DISCRIMINATOR_MODELS_PARAMS = {
 }


-def to_var(x, requires_grad=False, volatile=False, device='cuda'):
-    if torch.cuda.is_available() and device == 'cuda':
+def to_var(x, requires_grad=False, volatile=False, device="cuda"):
+    if torch.cuda.is_available() and device == "cuda":
        x = x.cuda()
-    elif device != 'cuda':
+    elif device != "cuda":
        x = x.to(device)
    return Variable(x, requires_grad=requires_grad, volatile=volatile)

@@ -95,49 +95,39 @@ def top_k_filter(logits, k, probs=False):
        values = torch.topk(logits, k)[0]
        batch_mins = values[:, -1].view(-1, 1).expand_as(logits)
        if probs:
-            return torch.where(logits < batch_mins,
-                               torch.ones_like(logits) * 0.0, logits)
-        return torch.where(logits < batch_mins,
-                           torch.ones_like(logits) * -BIG_CONST,
-                           logits)
+            return torch.where(logits < batch_mins, torch.ones_like(logits) * 0.0, logits)
+        return torch.where(logits < batch_mins, torch.ones_like(logits) * -BIG_CONST, logits)


 def perturb_past(
-        past,
-        model,
-        last,
-        unpert_past=None,
-        unpert_logits=None,
-        accumulated_hidden=None,
-        grad_norms=None,
-        stepsize=0.01,
-        one_hot_bows_vectors=None,
-        classifier=None,
-        class_label=None,
-        loss_type=0,
-        num_iterations=3,
-        horizon_length=1,
-        window_length=0,
-        decay=False,
-        gamma=1.5,
-        kl_scale=0.01,
-        device='cuda',
+    past,
+    model,
+    last,
+    unpert_past=None,
+    unpert_logits=None,
+    accumulated_hidden=None,
+    grad_norms=None,
+    stepsize=0.01,
+    one_hot_bows_vectors=None,
+    classifier=None,
+    class_label=None,
+    loss_type=0,
+    num_iterations=3,
+    horizon_length=1,
+    window_length=0,
+    decay=False,
+    gamma=1.5,
+    kl_scale=0.01,
+    device="cuda",
 ):
    # Generate inital perturbed past
-    grad_accumulator = [
-        (np.zeros(p.shape).astype("float32"))
-        for p in past
-    ]
+    grad_accumulator = [(np.zeros(p.shape).astype("float32")) for p in past]

    if accumulated_hidden is None:
        accumulated_hidden = 0

    if decay:
-        decay_mask = torch.arange(
-            0.,
-            1.0 + SMALL_CONST,
-            1.0 / (window_length)
-        )[1:]
+        decay_mask = torch.arange(0.0, 1.0 + SMALL_CONST, 1.0 / (window_length))[1:]
    else:
        decay_mask = 1.0

@@ -146,26 +136,17 @@ def perturb_past(
    _, _, _, curr_length, _ = past[0].shape

    if curr_length > window_length and window_length > 0:
-        ones_key_val_shape = (
-                tuple(past[0].shape[:-2])
-                + tuple([window_length])
-                + tuple(past[0].shape[-1:])
-        )
+        ones_key_val_shape = tuple(past[0].shape[:-2]) + tuple([window_length]) + tuple(past[0].shape[-1:])

        zeros_key_val_shape = (
-                tuple(past[0].shape[:-2])
-                + tuple([curr_length - window_length])
-                + tuple(past[0].shape[-1:])
+            tuple(past[0].shape[:-2]) + tuple([curr_length - window_length]) + tuple(past[0].shape[-1:])
        )

        ones_mask = torch.ones(ones_key_val_shape)
        ones_mask = decay_mask * ones_mask.permute(0, 1, 2, 4, 3)
        ones_mask = ones_mask.permute(0, 1, 2, 4, 3)

-        window_mask = torch.cat(
-            (ones_mask, torch.zeros(zeros_key_val_shape)),
-            dim=-2
-        ).to(device)
+        window_mask = torch.cat((ones_mask, torch.zeros(zeros_key_val_shape)), dim=-2).to(device)
    else:
        window_mask = torch.ones_like(past[0]).to(device)

@@ -175,8 +156,7 @@ def perturb_past(
    for i in range(num_iterations):
        print("Iteration ", i + 1)
        curr_perturbation = [
-            to_var(torch.from_numpy(p_), requires_grad=True, device=device)
-            for p_ in grad_accumulator
+            to_var(torch.from_numpy(p_), requires_grad=True, device=device) for p_ in grad_accumulator
        ]

        # Compute hidden using perturbed past
@@ -184,10 +164,7 @@ def perturb_past(
        _, _, _, curr_length, _ = curr_perturbation[0].shape
        all_logits, _, all_hidden = model(last, past=perturbed_past)
        hidden = all_hidden[-1]
-        new_accumulated_hidden = accumulated_hidden + torch.sum(
-            hidden,
-            dim=1
-        ).detach()
+        new_accumulated_hidden = accumulated_hidden + torch.sum(hidden, dim=1).detach()
        # TODO: Check the layer-norm consistency of this with trained discriminator (Sumanth)
        logits = all_logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)
@@ -210,20 +187,13 @@ def perturb_past(
            wte = model.resize_token_embeddings()
            for _ in range(horizon_length):
                inputs_embeds = torch.matmul(curr_probs, wte.weight.data)
-                _, curr_unpert_past, curr_all_hidden = model(
-                    past=curr_unpert_past,
-                    inputs_embeds=inputs_embeds
-                )
+                _, curr_unpert_past, curr_all_hidden = model(past=curr_unpert_past, inputs_embeds=inputs_embeds)
                curr_hidden = curr_all_hidden[-1]
-                new_accumulated_hidden = new_accumulated_hidden + torch.sum(
-                    curr_hidden, dim=1)
+                new_accumulated_hidden = new_accumulated_hidden + torch.sum(curr_hidden, dim=1)

-            prediction = classifier(new_accumulated_hidden /
-                                    (curr_length + 1 + horizon_length))
+            prediction = classifier(new_accumulated_hidden / (curr_length + 1 + horizon_length))

-            label = torch.tensor(prediction.shape[0] * [class_label],
-                                 device=device,
-                                 dtype=torch.long)
+            label = torch.tensor(prediction.shape[0] * [class_label], device=device, dtype=torch.long)
            discrim_loss = ce_loss(prediction, label)
            print(" pplm_discrim_loss:", discrim_loss.data.cpu().numpy())
            loss += discrim_loss
@@ -232,21 +202,15 @@ def perturb_past(
        kl_loss = 0.0
        if kl_scale > 0.0:
            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
-            unpert_probs = (
-                    unpert_probs + SMALL_CONST *
-                    (unpert_probs <= SMALL_CONST).float().to(device).detach()
-            )
-            correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(
-                device).detach()
+            unpert_probs = unpert_probs + SMALL_CONST * (unpert_probs <= SMALL_CONST).float().to(device).detach()
+            correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(device).detach()
            corrected_probs = probs + correction.detach()
-            kl_loss = kl_scale * (
-                (corrected_probs * (corrected_probs / unpert_probs).log()).sum()
-            )
-            print(' kl_loss', kl_loss.data.cpu().numpy())
+            kl_loss = kl_scale * ((corrected_probs * (corrected_probs / unpert_probs).log()).sum())
+            print(" kl_loss", kl_loss.data.cpu().numpy())
            loss += kl_loss

        loss_per_iter.append(loss.data.cpu().numpy())
-        print(' pplm_loss', (loss - kl_loss).data.cpu().numpy())
+        print(" pplm_loss", (loss - kl_loss).data.cpu().numpy())

        # compute gradients
        loss.backward()
@@ -259,15 +223,12 @@ def perturb_past(
            ]
        else:
            grad_norms = [
-                (torch.norm(p_.grad * window_mask) + SMALL_CONST)
-                for index, p_ in enumerate(curr_perturbation)
+                (torch.norm(p_.grad * window_mask) + SMALL_CONST) for index, p_ in enumerate(curr_perturbation)
            ]

        # normalize gradients
        grad = [
-            -stepsize *
-            (p_.grad * window_mask / grad_norms[
-                index] ** gamma).data.cpu().numpy()
+            -stepsize * (p_.grad * window_mask / grad_norms[index] ** gamma).data.cpu().numpy()
            for index, p_ in enumerate(curr_perturbation)
        ]

@@ -285,36 +246,27 @@ def perturb_past(
        past = new_past

    # apply the accumulated perturbations to the past
-    grad_accumulator = [
-        to_var(torch.from_numpy(p_), requires_grad=True, device=device)
-        for p_ in grad_accumulator
-    ]
+    grad_accumulator = [to_var(torch.from_numpy(p_), requires_grad=True, device=device) for p_ in grad_accumulator]
    pert_past = list(map(add, past, grad_accumulator))

    return pert_past, new_accumulated_hidden, grad_norms, loss_per_iter


 def get_classifier(
-        name: Optional[str], class_label: Union[str, int],
-        device: str
+    name: Optional[str], class_label: Union[str, int], device: str
 ) -> Tuple[Optional[ClassificationHead], Optional[int]]:
    if name is None:
        return None, None

    params = DISCRIMINATOR_MODELS_PARAMS[name]
-    classifier = ClassificationHead(
-        class_size=params['class_size'],
-        embed_size=params['embed_size']
-    ).to(device)
+    classifier = ClassificationHead(class_size=params["class_size"], embed_size=params["embed_size"]).to(device)
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
-        raise ValueError("Either url or path have to be specified "
-                         "in the discriminator model parameters")
-    classifier.load_state_dict(
-        torch.load(resolved_archive_file, map_location=device))
+        raise ValueError("Either url or path have to be specified " "in the discriminator model parameters")
+    classifier.load_state_dict(torch.load(resolved_archive_file, map_location=device))
    classifier.eval()

    if isinstance(class_label, str):
@@ -341,8 +293,7 @@ def get_classifier(
    return classifier, label_id


-def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> \
-        List[List[List[int]]]:
+def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> List[List[List[int]]]:
    bow_indices = []
    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
@@ -351,13 +302,11 @@ def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) ->
            filepath = id_or_path
        with open(filepath, "r") as f:
            words = f.read().strip().split("\n")
-        bow_indices.append(
-            [tokenizer.encode(word.strip(), add_prefix_space=True) for word in
-             words])
+        bow_indices.append([tokenizer.encode(word.strip(), add_prefix_space=True) for word in words])
    return bow_indices


-def build_bows_one_hot_vectors(bow_indices, tokenizer, device='cuda'):
+def build_bows_one_hot_vectors(bow_indices, tokenizer, device="cuda"):
    if bow_indices is None:
        return None

@@ -373,39 +322,34 @@ def build_bows_one_hot_vectors(bow_indices, tokenizer, device='cuda'):


 def full_text_generation(
-        model,
-        tokenizer,
-        context=None,
-        num_samples=1,
-        device="cuda",
-        bag_of_words=None,
-        discrim=None,
-        class_label=None,
-        length=100,
-        stepsize=0.02,
-        temperature=1.0,
-        top_k=10,
-        sample=False,
-        num_iterations=3,
-        grad_length=10000,
-        horizon_length=1,
-        window_length=0,
-        decay=False,
-        gamma=1.5,
-        gm_scale=0.9,
-        kl_scale=0.01,
-        **kwargs
+    model,
+    tokenizer,
+    context=None,
+    num_samples=1,
+    device="cuda",
+    bag_of_words=None,
+    discrim=None,
+    class_label=None,
+    length=100,
+    stepsize=0.02,
+    temperature=1.0,
+    top_k=10,
+    sample=False,
+    num_iterations=3,
+    grad_length=10000,
+    horizon_length=1,
+    window_length=0,
+    decay=False,
+    gamma=1.5,
+    gm_scale=0.9,
+    kl_scale=0.01,
+    **kwargs
 ):
-    classifier, class_id = get_classifier(
-        discrim,
-        class_label,
-        device
-    )
+    classifier, class_id = get_classifier(discrim, class_label, device)

    bow_indices = []
    if bag_of_words:
-        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
-                                               tokenizer)
+        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer)

    if bag_of_words and classifier:
        print("Both PPLM-BoW and PPLM-Discrim are on. This is not optimized.")
@@ -423,15 +367,9 @@ def full_text_generation(
        raise Exception("Specify either a bag of words or a discriminator")

    unpert_gen_tok_text, _, _ = generate_text_pplm(
-        model=model,
-        tokenizer=tokenizer,
-        context=context,
-        device=device,
-        length=length,
-        sample=sample,
-        perturb=False
+        model=model, tokenizer=tokenizer, context=context, device=device, length=length, sample=sample, perturb=False
    )
-    if device == 'cuda':
+    if device == "cuda":
        torch.cuda.empty_cache()

    pert_gen_tok_texts = []
@@ -468,36 +406,36 @@ def full_text_generation(
            discrim_losses.append(discrim_loss.data.cpu().numpy())
        losses_in_time.append(loss_in_time)

-    if device == 'cuda':
+    if device == "cuda":
        torch.cuda.empty_cache()

    return unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time


 def generate_text_pplm(
-        model,
-        tokenizer,
-        context=None,
-        past=None,
-        device="cuda",
-        perturb=True,
-        bow_indices=None,
-        classifier=None,
-        class_label=None,
-        loss_type=0,
-        length=100,
-        stepsize=0.02,
-        temperature=1.0,
-        top_k=10,
-        sample=False,
-        num_iterations=3,
-        grad_length=10000,
-        horizon_length=1,
-        window_length=0,
-        decay=False,
-        gamma=1.5,
-        gm_scale=0.9,
-        kl_scale=0.01,
+    model,
+    tokenizer,
+    context=None,
+    past=None,
+    device="cuda",
+    perturb=True,
+    bow_indices=None,
+    classifier=None,
+    class_label=None,
+    loss_type=0,
+    length=100,
+    stepsize=0.02,
+    temperature=1.0,
+    top_k=10,
+    sample=False,
+    num_iterations=3,
+    grad_length=10000,
+    horizon_length=1,
+    window_length=0,
+    decay=False,
+    gamma=1.5,
+    gm_scale=0.9,
+    kl_scale=0.01,
 ):
    output_so_far = None
    if context:
@@ -507,8 +445,7 @@ def generate_text_pplm(
        output_so_far = context_t

    # collect one hot vectors for bags of words
-    one_hot_bows_vectors = build_bows_one_hot_vectors(bow_indices, tokenizer,
-                                                      device)
+    one_hot_bows_vectors = build_bows_one_hot_vectors(bow_indices, tokenizer, device)

    grad_norms = None
    last = None
@@ -575,13 +512,9 @@ def generate_text_pplm(
        if classifier is not None:
            ce_loss = torch.nn.CrossEntropyLoss()
            prediction = classifier(torch.mean(unpert_last_hidden, dim=1))
-            label = torch.tensor([class_label], device=device,
-                                 dtype=torch.long)
+            label = torch.tensor([class_label], device=device, dtype=torch.long)
            unpert_discrim_loss = ce_loss(prediction, label)
-            print(
-                "unperturbed discrim loss",
-                unpert_discrim_loss.data.cpu().numpy()
-            )
+            print("unperturbed discrim loss", unpert_discrim_loss.data.cpu().numpy())
        else:
            unpert_discrim_loss = 0

@@ -590,10 +523,8 @@ def generate_text_pplm(

            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)

-            pert_probs = ((pert_probs ** gm_scale) * (
-                    unpert_probs ** (1 - gm_scale)))  # + SMALL_CONST
-            pert_probs = top_k_filter(pert_probs, k=top_k,
-                                      probs=True)  # + SMALL_CONST
+            pert_probs = (pert_probs ** gm_scale) * (unpert_probs ** (1 - gm_scale))  # + SMALL_CONST
+            pert_probs = top_k_filter(pert_probs, k=top_k, probs=True)  # + SMALL_CONST

            # rescale
            if torch.sum(pert_probs) <= 1:
@@ -611,10 +542,7 @@ def generate_text_pplm(
            _, last = torch.topk(pert_probs, k=1, dim=-1)

        # update context/output_so_far appending the new token
-        output_so_far = (
-            last if output_so_far is None
-            else torch.cat((output_so_far, last), dim=1)
-        )
+        output_so_far = last if output_so_far is None else torch.cat((output_so_far, last), dim=1)

        print(tokenizer.decode(output_so_far.tolist()[0]))

@@ -623,44 +551,42 @@ def generate_text_pplm(

 def set_generic_model_params(discrim_weights, discrim_meta):
    if discrim_weights is None:
-        raise ValueError('When using a generic discriminator, '
-                         'discrim_weights need to be specified')
+        raise ValueError("When using a generic discriminator, " "discrim_weights need to be specified")
    if discrim_meta is None:
-        raise ValueError('When using a generic discriminator, '
-                         'discrim_meta need to be specified')
+        raise ValueError("When using a generic discriminator, " "discrim_meta need to be specified")

-    with open(discrim_meta, 'r') as discrim_meta_file:
+    with open(discrim_meta, "r") as discrim_meta_file:
        meta = json.load(discrim_meta_file)
-    meta['path'] = discrim_weights
-    DISCRIMINATOR_MODELS_PARAMS['generic'] = meta
+    meta["path"] = discrim_weights
+    DISCRIMINATOR_MODELS_PARAMS["generic"] = meta


 def run_pplm_example(
-        pretrained_model="gpt2-medium",
-        cond_text="",
-        uncond=False,
-        num_samples=1,
-        bag_of_words=None,
-        discrim=None,
-        discrim_weights=None,
-        discrim_meta=None,
-        class_label=-1,
-        length=100,
-        stepsize=0.02,
-        temperature=1.0,
-        top_k=10,
-        sample=False,
-        num_iterations=3,
-        grad_length=10000,
-        horizon_length=1,
-        window_length=0,
-        decay=False,
-        gamma=1.5,
-        gm_scale=0.9,
-        kl_scale=0.01,
-        seed=0,
-        no_cuda=False,
-        colorama=False
+    pretrained_model="gpt2-medium",
+    cond_text="",
+    uncond=False,
+    num_samples=1,
+    bag_of_words=None,
+    discrim=None,
+    discrim_weights=None,
+    discrim_meta=None,
+    class_label=-1,
+    length=100,
+    stepsize=0.02,
+    temperature=1.0,
+    top_k=10,
+    sample=False,
+    num_iterations=3,
+    grad_length=10000,
+    horizon_length=1,
+    window_length=0,
+    decay=False,
+    gamma=1.5,
+    gm_scale=0.9,
+    kl_scale=0.01,
+    seed=0,
+    no_cuda=False,
+    colorama=False,
 ):
    # set Random seed
    torch.manual_seed(seed)
@@ -669,21 +595,15 @@ def run_pplm_example(
    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

-    if discrim == 'generic':
+    if discrim == "generic":
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
-        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
-            "pretrained_model"
-        ]
-        print("discrim = {}, pretrained_model set "
-              "to discriminator's = {}".format(discrim, pretrained_model))
+        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim]["pretrained_model"]
+        print("discrim = {}, pretrained_model set " "to discriminator's = {}".format(discrim, pretrained_model))

    # load pretrained model
-    model = GPT2LMHeadModel.from_pretrained(
-        pretrained_model,
-        output_hidden_states=True
-    )
+    model = GPT2LMHeadModel.from_pretrained(pretrained_model, output_hidden_states=True)
    model.to(device)
    model.eval()

@@ -696,9 +616,7 @@ def run_pplm_example(

    # figure out conditioning text
    if uncond:
-        tokenized_cond_text = tokenizer.encode(
-            [tokenizer.bos_token]
-        )
+        tokenized_cond_text = tokenizer.encode([tokenizer.bos_token])
    else:
        raw_text = cond_text
        while not raw_text:
@@ -750,8 +668,7 @@ def run_pplm_example(

    bow_word_ids = set()
    if bag_of_words and colorama:
-        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
-                                               tokenizer)
+        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer)
        for single_bow_list in bow_indices:
            # filtering all words in the list composed of more than 1 token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
@@ -765,13 +682,11 @@ def run_pplm_example(
            if colorama:
                import colorama

-                pert_gen_text = ''
+                pert_gen_text = ""
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
-                        pert_gen_text += '{}{}{}'.format(
-                            colorama.Fore.RED,
-                            tokenizer.decode([word_id]),
-                            colorama.Style.RESET_ALL
+                        pert_gen_text += "{}{}{}".format(
+                            colorama.Fore.RED, tokenizer.decode([word_id]), colorama.Style.RESET_ALL
                        )
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
@@ -785,14 +700,12 @@ def run_pplm_example(
            pass

        # keep the prefix, perturbed seq, original seq for each index
-        generated_texts.append(
-            (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text)
-        )
+        generated_texts.append((tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text))

    return


-if __name__ == '__main__':
+if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained_model",
@@ -801,19 +714,10 @@ if __name__ == '__main__':
        default="gpt2-medium",
        help="pretrained model name or path to local checkpoint",
    )
+    parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on")
+    parser.add_argument("--uncond", action="store_true", help="Generate from end-of-text as prefix")
    parser.add_argument(
-        "--cond_text", type=str, default="The lake",
-        help="Prefix texts to condition on"
-    )
-    parser.add_argument(
-        "--uncond", action="store_true",
-        help="Generate from end-of-text as prefix"
-    )
-    parser.add_argument(
-        "--num_samples",
-        type=int,
-        default=1,
-        help="Number of samples to generate from the modified latents",
+        "--num_samples", type=int, default=1, help="Number of samples to generate from the modified latents",
    )
    parser.add_argument(
        "--bag_of_words",
@@ -821,8 +725,8 @@ if __name__ == '__main__':
        type=str,
        default=None,
        help="Bags of words used for PPLM-BoW. "
-             "Either a BOW id (see list in code) or a filepath. "
-             "Multiple BoWs separated by ;",
+        "Either a BOW id (see list in code) or a filepath. "
+        "Multiple BoWs separated by ;",
    )
    parser.add_argument(
        "--discrim",
@@ -832,48 +736,36 @@ if __name__ == '__main__':
        choices=("clickbait", "sentiment", "toxicity", "generic"),
        help="Discriminator to use",
    )
-    parser.add_argument('--discrim_weights', type=str, default=None,
-                        help='Weights for the generic discriminator')
-    parser.add_argument('--discrim_meta', type=str, default=None,
-                        help='Meta information for the generic discriminator')
+    parser.add_argument("--discrim_weights", type=str, default=None, help="Weights for the generic discriminator")
    parser.add_argument(
-        "--class_label",
-        type=int,
-        default=-1,
-        help="Class label used for the discriminator",
+        "--discrim_meta", type=str, default=None, help="Meta information for the generic discriminator"
+    )
+    parser.add_argument(
+        "--class_label", type=int, default=-1, help="Class label used for the discriminator",
    )
    parser.add_argument("--length", type=int, default=100)
    parser.add_argument("--stepsize", type=float, default=0.02)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=10)
-    parser.add_argument(
-        "--sample", action="store_true",
-        help="Generate from end-of-text as prefix"
-    )
+    parser.add_argument("--sample", action="store_true", help="Generate from end-of-text as prefix")
    parser.add_argument("--num_iterations", type=int, default=3)
    parser.add_argument("--grad_length", type=int, default=10000)
    parser.add_argument(
        "--window_length",
        type=int,
        default=0,
-        help="Length of past which is being optimized; "
-             "0 corresponds to infinite window length",
+        help="Length of past which is being optimized; " "0 corresponds to infinite window length",
    )
    parser.add_argument(
-        "--horizon_length",
-        type=int,
-        default=1,
-        help="Length of future to optimize over",
+        "--horizon_length", type=int, default=1, help="Length of future to optimize over",
    )
-    parser.add_argument("--decay", action="store_true",
-                        help="whether to decay or not")
+    parser.add_argument("--decay", action="store_true", help="whether to decay or not")
    parser.add_argument("--gamma", type=float, default=1.5)
    parser.add_argument("--gm_scale", type=float, default=0.9)
    parser.add_argument("--kl_scale", type=float, default=0.01)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--no_cuda", action="store_true", help="no cuda")
-    parser.add_argument("--colorama", action="store_true",
-                        help="colors keywords")
+    parser.add_argument("--colorama", action="store_true", help="colors keywords")

    args = parser.parse_args()
    run_pplm_example(**vars(args))
--- a/examples/pplm/run_pplm_discrim_train.py
+++ b/examples/pplm/run_pplm_discrim_train.py
@@ -1,19 +1,19 @@
 #! /usr/bin/env python3
 # coding=utf-8

-#Copyright (c) 2019 Uber Technologies, Inc.
+# Copyright (c) 2019 Uber Technologies, Inc.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 import argparse
 import csv
@@ -42,26 +42,15 @@ example_sentence = "This is incredible! I love it, this is the best chicken I ha
 max_length_seq = 100


-
-
 class Discriminator(torch.nn.Module):
    """Transformer encoder followed by a Classification Head"""

-    def __init__(
-            self,
-            class_size,
-            pretrained_model="gpt2-medium",
-            cached_mode=False,
-            device='cpu'
-    ):
+    def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
        super(Discriminator, self).__init__()
        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
        self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
        self.embed_size = self.encoder.transformer.config.hidden_size
-        self.classifier_head = ClassificationHead(
-            class_size=class_size,
-            embed_size=self.embed_size
-        )
+        self.classifier_head = ClassificationHead(class_size=class_size, embed_size=self.embed_size)
        self.cached_mode = cached_mode
        self.device = device

@@ -74,14 +63,10 @@ class Discriminator(torch.nn.Module):
        self.classifier_head.train()

    def avg_representation(self, x):
-        mask = x.ne(0).unsqueeze(2).repeat(
-            1, 1, self.embed_size
-        ).float().to(self.device).detach()
+        mask = x.ne(0).unsqueeze(2).repeat(1, 1, self.embed_size).float().to(self.device).detach()
        hidden, _ = self.encoder.transformer(x)
        masked_hidden = hidden * mask
-        avg_hidden = torch.sum(masked_hidden, dim=1) / (
-                torch.sum(mask, dim=1).detach() + EPSILON
-        )
+        avg_hidden = torch.sum(masked_hidden, dim=1) / (torch.sum(mask, dim=1).detach() + EPSILON)
        return avg_hidden

    def forward(self, x):
@@ -117,10 +102,7 @@ def collate_fn(data):
    def pad_sequences(sequences):
        lengths = [len(seq) for seq in sequences]

-        padded_sequences = torch.zeros(
-            len(sequences),
-            max(lengths)
-        ).long()  # padding value = 0
+        padded_sequences = torch.zeros(len(sequences), max(lengths)).long()  # padding value = 0

        for i, seq in enumerate(sequences):
            end = lengths[i]
@@ -149,8 +131,7 @@ def cached_collate_fn(data):
    return x_batch, y_batch


-def train_epoch(data_loader, discriminator, optimizer,
-                epoch=0, log_interval=10, device='cpu'):
+def train_epoch(data_loader, discriminator, optimizer, epoch=0, log_interval=10, device="cpu"):
    samples_so_far = 0
    discriminator.train_custom()
    for batch_idx, (input_t, target_t) in enumerate(data_loader):
@@ -169,13 +150,15 @@ def train_epoch(data_loader, discriminator, optimizer,
            print(
                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch + 1,
-                    samples_so_far, len(data_loader.dataset),
-                    100 * samples_so_far / len(data_loader.dataset), loss.item()
+                    samples_so_far,
+                    len(data_loader.dataset),
+                    100 * samples_so_far / len(data_loader.dataset),
+                    loss.item(),
                )
            )


-def evaluate_performance(data_loader, discriminator, device='cpu'):
+def evaluate_performance(data_loader, discriminator, device="cpu"):
    discriminator.eval()
    test_loss = 0
    correct = 0
@@ -194,13 +177,12 @@ def evaluate_performance(data_loader, discriminator, device='cpu'):
    print(
        "Performance on test set: "
        "Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)".format(
-            test_loss, correct, len(data_loader.dataset),
-            100. * correct / len(data_loader.dataset)
+            test_loss, correct, len(data_loader.dataset), 100.0 * correct / len(data_loader.dataset)
        )
    )


-def predict(input_sentence, model, classes, cached=False, device='cpu'):
+def predict(input_sentence, model, classes, cached=False, device="cpu"):
    input_t = model.tokenizer.encode(input_sentence)
    input_t = torch.tensor([input_t], dtype=torch.long, device=device)
    if cached:
@@ -208,17 +190,14 @@ def predict(input_sentence, model, classes, cached=False, device='cpu'):

    log_probs = model(input_t).data.cpu().numpy().flatten().tolist()
    print("Input sentence:", input_sentence)
-    print("Predictions:", ", ".join(
-        "{}: {:.4f}".format(c, math.exp(log_prob)) for c, log_prob in
-        zip(classes, log_probs)
-    ))
+    print(
+        "Predictions:",
+        ", ".join("{}: {:.4f}".format(c, math.exp(log_prob)) for c, log_prob in zip(classes, log_probs)),
+    )


-def get_cached_data_loader(dataset, batch_size, discriminator,
-                           shuffle=False, device='cpu'):
-    data_loader = torch.utils.data.DataLoader(dataset=dataset,
-                                              batch_size=batch_size,
-                                              collate_fn=collate_fn)
+def get_cached_data_loader(dataset, batch_size, discriminator, shuffle=False, device="cpu"):
+    data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=collate_fn)

    xs = []
    ys = []
@@ -231,50 +210,44 @@ def get_cached_data_loader(dataset, batch_size, discriminator,
            ys += y.cpu().numpy().tolist()

    data_loader = torch.utils.data.DataLoader(
-        dataset=Dataset(xs, ys),
-        batch_size=batch_size,
-        shuffle=shuffle,
-        collate_fn=cached_collate_fn)
+        dataset=Dataset(xs, ys), batch_size=batch_size, shuffle=shuffle, collate_fn=cached_collate_fn
+    )

    return data_loader


 def train_discriminator(
-        dataset, dataset_fp=None, pretrained_model="gpt2-medium",
-        epochs=10, batch_size=64, log_interval=10,
-        save_model=False, cached=False, no_cuda=False):
+    dataset,
+    dataset_fp=None,
+    pretrained_model="gpt2-medium",
+    epochs=10,
+    batch_size=64,
+    log_interval=10,
+    save_model=False,
+    cached=False,
+    no_cuda=False,
+):
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    print("Preprocessing {} dataset...".format(dataset))
    start = time.time()

    if dataset == "SST":
-        idx2class = ["positive", "negative", "very positive", "very negative",
-                     "neutral"]
+        idx2class = ["positive", "negative", "very positive", "very negative", "neutral"]
        class2idx = {c: i for i, c in enumerate(idx2class)}

        discriminator = Discriminator(
-            class_size=len(idx2class),
-            pretrained_model=pretrained_model,
-            cached_mode=cached,
-            device=device
+            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
        ).to(device)

        text = torchtext_data.Field()
        label = torchtext_data.Field(sequential=False)
-        train_data, val_data, test_data = datasets.SST.splits(
-            text,
-            label,
-            fine_grained=True,
-            train_subtrees=True,
-        )
+        train_data, val_data, test_data = datasets.SST.splits(text, label, fine_grained=True, train_subtrees=True,)

        x = []
        y = []
        for i in trange(len(train_data), ascii=True):
-            seq = TreebankWordDetokenizer().detokenize(
-                vars(train_data[i])["text"]
-            )
+            seq = TreebankWordDetokenizer().detokenize(vars(train_data[i])["text"])
            seq = discriminator.tokenizer.encode(seq)
            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
            x.append(seq)
@@ -284,9 +257,7 @@ def train_discriminator(
        test_x = []
        test_y = []
        for i in trange(len(test_data), ascii=True):
-            seq = TreebankWordDetokenizer().detokenize(
-                vars(test_data[i])["text"]
-            )
+            seq = TreebankWordDetokenizer().detokenize(vars(test_data[i])["text"])
            seq = discriminator.tokenizer.encode(seq)
            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
            test_x.append(seq)
@@ -306,10 +277,7 @@ def train_discriminator(
        class2idx = {c: i for i, c in enumerate(idx2class)}

        discriminator = Discriminator(
-            class_size=len(idx2class),
-            pretrained_model=pretrained_model,
-            cached_mode=cached,
-            device=device
+            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
        ).to(device)

        with open("datasets/clickbait/clickbait_train_prefix.txt") as f:
@@ -318,9 +286,7 @@ def train_discriminator(
                try:
                    data.append(eval(line))
                except:
-                    print("Error evaluating line {}: {}".format(
-                        i, line
-                    ))
+                    print("Error evaluating line {}: {}".format(i, line))
                    continue
        x = []
        y = []
@@ -331,27 +297,20 @@ def train_discriminator(
                    seq = discriminator.tokenizer.encode(d["text"])

                    if len(seq) < max_length_seq:
-                        seq = torch.tensor(
-                            [50256] + seq, device=device, dtype=torch.long
-                        )
+                        seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
                    else:
-                        print("Line {} is longer than maximum length {}".format(
-                            i, max_length_seq
-                        ))
+                        print("Line {} is longer than maximum length {}".format(i, max_length_seq))
                        continue
                    x.append(seq)
                    y.append(d["label"])
                except:
-                    print("Error evaluating / tokenizing"
-                          " line {}, skipping it".format(i))
+                    print("Error evaluating / tokenizing" " line {}, skipping it".format(i))
                    pass

        full_dataset = Dataset(x, y)
        train_size = int(0.9 * len(full_dataset))
        test_size = len(full_dataset) - train_size
-        train_dataset, test_dataset = torch.utils.data.random_split(
-            full_dataset, [train_size, test_size]
-        )
+        train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])

        discriminator_meta = {
            "class_size": len(idx2class),
@@ -366,10 +325,7 @@ def train_discriminator(
        class2idx = {c: i for i, c in enumerate(idx2class)}

        discriminator = Discriminator(
-            class_size=len(idx2class),
-            pretrained_model=pretrained_model,
-            cached_mode=cached,
-            device=device
+            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
        ).to(device)

        x = []
@@ -381,27 +337,20 @@ def train_discriminator(
                    seq = discriminator.tokenizer.encode(d["text"])

                    if len(seq) < max_length_seq:
-                        seq = torch.tensor(
-                            [50256] + seq, device=device, dtype=torch.long
-                        )
+                        seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
                    else:
-                        print("Line {} is longer than maximum length {}".format(
-                            i, max_length_seq
-                        ))
+                        print("Line {} is longer than maximum length {}".format(i, max_length_seq))
                        continue
                    x.append(seq)
                    y.append(int(np.sum(d["label"]) > 0))
                except:
-                    print("Error evaluating / tokenizing"
-                          " line {}, skipping it".format(i))
+                    print("Error evaluating / tokenizing" " line {}, skipping it".format(i))
                    pass

        full_dataset = Dataset(x, y)
        train_size = int(0.9 * len(full_dataset))
        test_size = len(full_dataset) - train_size
-        train_dataset, test_dataset = torch.utils.data.random_split(
-            full_dataset, [train_size, test_size]
-        )
+        train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])

        discriminator_meta = {
            "class_size": len(idx2class),
@@ -416,8 +365,7 @@ def train_discriminator(
        # class \t text

        if dataset_fp is None:
-            raise ValueError("When generic dataset is selected, "
-                             "dataset_fp needs to be specified aswell.")
+            raise ValueError("When generic dataset is selected, " "dataset_fp needs to be specified aswell.")

        classes = set()
        with open(dataset_fp) as f:
@@ -430,10 +378,7 @@ def train_discriminator(
        class2idx = {c: i for i, c in enumerate(idx2class)}

        discriminator = Discriminator(
-            class_size=len(idx2class),
-            pretrained_model=pretrained_model,
-            cached_mode=cached,
-            device=device
+            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
        ).to(device)

        x = []
@@ -447,18 +392,11 @@ def train_discriminator(

                    try:
                        seq = discriminator.tokenizer.encode(text)
-                        if (len(seq) < max_length_seq):
-                            seq = torch.tensor(
-                                [50256] + seq,
-                                device=device,
-                                dtype=torch.long
-                            )
+                        if len(seq) < max_length_seq:
+                            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)

                        else:
-                            print(
-                                "Line {} is longer than maximum length {}".format(
-                                    i, max_length_seq
-                                ))
+                            print("Line {} is longer than maximum length {}".format(i, max_length_seq))
                            continue

                        x.append(seq)
@@ -471,10 +409,7 @@ def train_discriminator(
        full_dataset = Dataset(x, y)
        train_size = int(0.9 * len(full_dataset))
        test_size = len(full_dataset) - train_size
-        train_dataset, test_dataset = torch.utils.data.random_split(
-            full_dataset,
-            [train_size, test_size]
-        )
+        train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])

        discriminator_meta = {
            "class_size": len(idx2class),
@@ -485,9 +420,7 @@ def train_discriminator(
        }

    end = time.time()
-    print("Preprocessed {} data points".format(
-        len(train_dataset) + len(test_dataset))
-    )
+    print("Preprocessed {} data points".format(len(train_dataset) + len(test_dataset)))
    print("Data preprocessing took: {:.3f}s".format(end - start))

    if cached:
@@ -495,30 +428,21 @@ def train_discriminator(

        start = time.time()

-        train_loader = get_cached_data_loader(
-            train_dataset, batch_size, discriminator,
-            shuffle=True, device=device
-        )
+        train_loader = get_cached_data_loader(train_dataset, batch_size, discriminator, shuffle=True, device=device)

-        test_loader = get_cached_data_loader(
-            test_dataset, batch_size, discriminator, device=device
-        )
+        test_loader = get_cached_data_loader(test_dataset, batch_size, discriminator, device=device)

        end = time.time()
        print("Building representation cache took: {:.3f}s".format(end - start))

    else:
-        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
-                                                   batch_size=batch_size,
-                                                   shuffle=True,
-                                                   collate_fn=collate_fn)
-        test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
-                                                  batch_size=batch_size,
-                                                  collate_fn=collate_fn)
+        train_loader = torch.utils.data.DataLoader(
+            dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
+        )
+        test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, collate_fn=collate_fn)

    if save_model:
-        with open("{}_classifier_head_meta.json".format(dataset),
-                  "w") as meta_file:
+        with open("{}_classifier_head_meta.json".format(dataset), "w") as meta_file:
            json.dump(discriminator_meta, meta_file)

    optimizer = optim.Adam(discriminator.parameters(), lr=0.0001)
@@ -533,56 +457,61 @@ def train_discriminator(
            optimizer=optimizer,
            epoch=epoch,
            log_interval=log_interval,
-            device=device
-        )
-        evaluate_performance(
-            data_loader=test_loader,
-            discriminator=discriminator,
-            device=device
+            device=device,
        )
+        evaluate_performance(data_loader=test_loader, discriminator=discriminator, device=device)

        end = time.time()
        print("Epoch took: {:.3f}s".format(end - start))

        print("\nExample prediction")
-        predict(example_sentence, discriminator, idx2class,
-                cached=cached, device=device)
+        predict(example_sentence, discriminator, idx2class, cached=cached, device=device)

        if save_model:
            # torch.save(discriminator.state_dict(),
            #           "{}_discriminator_{}.pt".format(
            #               args.dataset, epoch + 1
            #               ))
-            torch.save(discriminator.get_classifier().state_dict(),
-                       "{}_classifier_head_epoch_{}.pt".format(dataset,
-                                                               epoch + 1))
+            torch.save(
+                discriminator.get_classifier().state_dict(),
+                "{}_classifier_head_epoch_{}.pt".format(dataset, epoch + 1),
+            )


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Train a discriminator on top of GPT-2 representations")
-    parser.add_argument("--dataset", type=str, default="SST",
-                        choices=("SST", "clickbait", "toxic", "generic"),
-                        help="dataset to train the discriminator on."
-                             "In case of generic, the dataset is expected"
-                             "to be a TSBV file with structure: class \\t text")
-    parser.add_argument("--dataset_fp", type=str, default="",
-                        help="File path of the dataset to use. "
-                             "Needed only in case of generic datadset")
-    parser.add_argument("--pretrained_model", type=str, default="gpt2-medium",
-                        help="Pretrained model to use as encoder")
-    parser.add_argument("--epochs", type=int, default=10, metavar="N",
-                        help="Number of training epochs")
-    parser.add_argument("--batch_size", type=int, default=64, metavar="N",
-                        help="input batch size for training (default: 64)")
-    parser.add_argument("--log_interval", type=int, default=10, metavar="N",
-                        help="how many batches to wait before logging training status")
-    parser.add_argument("--save_model", action="store_true",
-                        help="whether to save the model")
-    parser.add_argument("--cached", action="store_true",
-                        help="whether to cache the input representations")
-    parser.add_argument("--no_cuda", action="store_true",
-                        help="use to turn off cuda")
+    parser = argparse.ArgumentParser(description="Train a discriminator on top of GPT-2 representations")
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="SST",
+        choices=("SST", "clickbait", "toxic", "generic"),
+        help="dataset to train the discriminator on."
+        "In case of generic, the dataset is expected"
+        "to be a TSBV file with structure: class \\t text",
+    )
+    parser.add_argument(
+        "--dataset_fp",
+        type=str,
+        default="",
+        help="File path of the dataset to use. " "Needed only in case of generic datadset",
+    )
+    parser.add_argument(
+        "--pretrained_model", type=str, default="gpt2-medium", help="Pretrained model to use as encoder"
+    )
+    parser.add_argument("--epochs", type=int, default=10, metavar="N", help="Number of training epochs")
+    parser.add_argument(
+        "--batch_size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)"
+    )
+    parser.add_argument(
+        "--log_interval",
+        type=int,
+        default=10,
+        metavar="N",
+        help="how many batches to wait before logging training status",
+    )
+    parser.add_argument("--save_model", action="store_true", help="whether to save the model")
+    parser.add_argument("--cached", action="store_true", help="whether to cache the input representations")
+    parser.add_argument("--no_cuda", action="store_true", help="use to turn off cuda")
    args = parser.parse_args()

    train_discriminator(**(vars(args)))
--- a/examples/run_bertology.py
+++ b/examples/run_bertology.py
@@ -32,10 +32,18 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn import CrossEntropyLoss, MSELoss

-from transformers import (WEIGHTS_NAME,
-                                  BertConfig, BertForSequenceClassification, BertTokenizer,
-                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer,
-                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertForSequenceClassification,
+    BertTokenizer,
+    XLMConfig,
+    XLMForSequenceClassification,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetForSequenceClassification,
+    XLNetTokenizer,
+)

 from run_glue import set_seed, load_and_cache_examples, ALL_MODELS, MODEL_CLASSES

@@ -63,7 +71,9 @@ def print_2d_tensor(tensor):
            logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data))


-def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None):
+def compute_heads_importance(
+    args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None
+):
    """ This method shows how to compute:
        - head attention entropy
        - head importance scores according to http://arxiv.org/abs/1905.10650
@@ -85,8 +95,14 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True,
        input_ids, input_mask, segment_ids, label_ids = batch

        # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
-        outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask)
-        loss, logits, all_attentions = outputs[0], outputs[1], outputs[-1]  # Loss and logits are the first, attention the last
+        outputs = model(
+            input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask
+        )
+        loss, logits, all_attentions = (
+            outputs[0],
+            outputs[1],
+            outputs[-1],
+        )  # Loss and logits are the first, attention the last
        loss.backward()  # Backpropagate to populate the gradients in the head mask

        if compute_entropy:
@@ -113,15 +129,15 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True,
    # Layerwise importance normalization
    if not args.dont_normalize_importance_by_layer:
        exponent = 2
-        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1/exponent)
+        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1 / exponent)
        head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20

    if not args.dont_normalize_global_importance:
        head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())

    # Print/save matrices
-    np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy())
-    np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy())
+    np.save(os.path.join(args.output_dir, "attn_entropy.npy"), attn_entropy.detach().cpu().numpy())
+    np.save(os.path.join(args.output_dir, "head_importance.npy"), head_importance.detach().cpu().numpy())

    logger.info("Attention entropies")
    print_2d_tensor(attn_entropy)
@@ -129,7 +145,9 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True,
    print_2d_tensor(head_importance)
    logger.info("Head ranked by importance scores")
    head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)
-    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device)
+    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(
+        head_importance.numel(), device=args.device
+    )
    head_ranks = head_ranks.view_as(head_importance)
    print_2d_tensor(head_ranks)

@@ -150,9 +168,9 @@ def mask_heads(args, model, eval_dataloader):

    current_score = original_score
    while current_score >= original_score * args.masking_threshold:
-        head_mask = new_head_mask.clone() # save current head mask
+        head_mask = new_head_mask.clone()  # save current head mask
        # heads from least important to most - keep only not-masked heads
-        head_importance[head_mask == 0.0] = float('Inf')
+        head_importance[head_mask == 0.0] = float("Inf")
        current_heads_to_mask = head_importance.view(-1).sort()[1]

        if len(current_heads_to_mask) <= num_to_mask:
@@ -167,14 +185,21 @@ def mask_heads(args, model, eval_dataloader):
        print_2d_tensor(new_head_mask)

        # Compute metric and head importance again
-        _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask)
+        _, head_importance, preds, labels = compute_heads_importance(
+            args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask
+        )
        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
        current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
-        logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100)
+        logger.info(
+            "Masking: current score: %f, remaning heads %d (%.1f percents)",
+            current_score,
+            new_head_mask.sum(),
+            new_head_mask.sum() / new_head_mask.numel() * 100,
+        )

    logger.info("Final head mask")
    print_2d_tensor(head_mask)
-    np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy())
+    np.save(os.path.join(args.output_dir, "head_mask.npy"), head_mask.detach().cpu().numpy())

    return head_mask

@@ -186,8 +211,9 @@ def prune_heads(args, model, eval_dataloader, head_mask):
    # Try pruning and test time speedup
    # Pruning is like masking but we actually remove the masked weights
    before_time = datetime.now()
-    _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
-                                                   compute_entropy=False, compute_importance=False, head_mask=head_mask)
+    _, _, preds, labels = compute_heads_importance(
+        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask
+    )
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name]
    original_time = datetime.now() - before_time
@@ -199,73 +225,127 @@ def prune_heads(args, model, eval_dataloader, head_mask):
    pruned_num_params = sum(p.numel() for p in model.parameters())

    before_time = datetime.now()
-    _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
-                                                    compute_entropy=False, compute_importance=False, head_mask=None)
+    _, _, preds, labels = compute_heads_importance(
+        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=None
+    )
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name]
    new_time = datetime.now() - before_time

-    logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100)
+    logger.info(
+        "Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)",
+        original_num_params,
+        pruned_num_params,
+        pruned_num_params / original_num_params * 100,
+    )
    logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
-    logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100)
+    logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time / new_time * 100)


 def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
-                            ALL_MODELS))
-    parser.add_argument("--task_name", default=None, type=str, required=True,
-                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--task_name",
+        default=None,
+        type=str,
+        required=True,
+        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )

    ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name_or_path")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name_or_path")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--data_subset", type=int, default=-1,
-                        help="If > 0: limit the data to a subset of data_subset instances.")
-    parser.add_argument("--overwrite_output_dir", action='store_true',
-                        help="Whether to overwrite data in output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument(
+        "--config_name",
+        default="",
+        type=str,
+        help="Pretrained config name or path if not the same as model_name_or_path",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name_or_path",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances."
+    )
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Whether to overwrite data in output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )

-    parser.add_argument("--dont_normalize_importance_by_layer", action='store_true',
-                        help="Don't normalize importance score by layers")
-    parser.add_argument("--dont_normalize_global_importance", action='store_true',
-                        help="Don't normalize all importance scores between 0 and 1")
+    parser.add_argument(
+        "--dont_normalize_importance_by_layer", action="store_true", help="Don't normalize importance score by layers"
+    )
+    parser.add_argument(
+        "--dont_normalize_global_importance",
+        action="store_true",
+        help="Don't normalize all importance scores between 0 and 1",
+    )

-    parser.add_argument("--try_masking", action='store_true',
-                        help="Whether to try to mask head until a threshold of accuracy.")
-    parser.add_argument("--masking_threshold", default=0.9, type=float,
-                        help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).")
-    parser.add_argument("--masking_amount", default=0.1, type=float,
-                        help="Amount to heads to masking at each masking step.")
-    parser.add_argument("--metric_name", default="acc", type=str,
-                        help="Metric to use for head masking.")
+    parser.add_argument(
+        "--try_masking", action="store_true", help="Whether to try to mask head until a threshold of accuracy."
+    )
+    parser.add_argument(
+        "--masking_threshold",
+        default=0.9,
+        type=float,
+        help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).",
+    )
+    parser.add_argument(
+        "--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step."
+    )
+    parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.")

-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, sequences shorter padded.")
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after WordPiece tokenization. \n"
+        "Sequences longer than this will be truncated, sequences shorter padded.",
+    )
    parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")

    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
-    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -278,10 +358,10 @@ def main():
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
-        torch.distributed.init_process_group(backend='nccl')  # Initializes the distributed backend
+        torch.distributed.init_process_group(backend="nccl")  # Initializes the distributed backend

    # Setup logging
-    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1)))

    # Set seeds
@@ -306,17 +386,23 @@ def main():
            args.model_type = key  # take the first match in model types
            break
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          num_labels=num_labels,
-                                          finetuning_task=args.task_name,
-                                          output_attentions=True,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=args.task_name,
+        output_attentions=True,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -324,14 +410,14 @@ def main():
    # Distributed and parallel training
    model.to(args.device)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Print/save training arguments
-    torch.save(args, os.path.join(args.output_dir, 'run_args.bin'))
+    torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
    logger.info("Training/evaluation parameters %s", args)

    # Prepare dataset for the GLUE task
@@ -341,11 +427,9 @@ def main():
    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

-
    # Compute head entropy and importance score
    compute_heads_importance(args, model, eval_dataloader)

-
    # Try head masking (set heads to zero until the score goes under a threshole)
    # and head pruning (remove masked heads and see the effect on the network)
    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
@@ -353,5 +437,5 @@ def main():
        prune_heads(args, model, eval_dataloader, head_mask)


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -33,9 +33,7 @@ from transformers import XLMWithLMHeadModel, XLMTokenizer


 logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-    datefmt="%m/%d/%Y %H:%M:%S",
-    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO,
 )
 logger = logging.getLogger(__name__)

@@ -71,6 +69,7 @@ def set_seed(args):
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

+
 #
 # Functions to prepare models' input
 #
@@ -78,15 +77,11 @@ def set_seed(args):

 def prepare_ctrl_input(args, _, tokenizer, prompt_text):
    if args.temperature > 0.7:
-        logger.info(
-            "CTRL typically works better with lower temperatures (and lower top_k)."
-        )
+        logger.info("CTRL typically works better with lower temperatures (and lower top_k).")

    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False)
    if not any(encoded_prompt[0] == x for x in tokenizer.control_codes.values()):
-        logger.info(
-            "WARNING! You are not starting your generation from a control code so you won't get good results"
-        )
+        logger.info("WARNING! You are not starting your generation from a control code so you won't get good results")
    return prompt_text


@@ -102,11 +97,7 @@ def prepare_xlm_input(args, model, tokenizer, prompt_text):
        else:
            language = None
            while language not in available_languages:
-                language = input(
-                    "Using XLM. Select language in "
-                    + str(list(available_languages))
-                    + " >>> "
-                )
+                language = input("Using XLM. Select language in " + str(list(available_languages)) + " >>> ")
        # kwargs["language"] = tokenizer.lang2id[language]

    # TODO fix mask_token_id setup when configurations will be synchronized between models and tokenizers
@@ -148,17 +139,34 @@ def adjust_length_to_model(length, max_sequence_length):

 def main():
    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )

    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--length", type=int, default=20)
    parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped")

-    parser.add_argument("--temperature", type=float, default=1.0, help="temperature of 1.0 has no effect, lower tend toward greedy sampling")
-    parser.add_argument("--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2")
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=1.0,
+        help="temperature of 1.0 has no effect, lower tend toward greedy sampling",
+    )
+    parser.add_argument(
+        "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2"
+    )
    parser.add_argument("--k", type=int, default=0)
    parser.add_argument("--p", type=float, default=0.9)

@@ -169,9 +177,7 @@ def main():
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    args = parser.parse_args()

-    args.device = torch.device(
-        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
-    )
+    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()

    set_seed(args)
@@ -181,17 +187,13 @@ def main():
        args.model_type = args.model_type.lower()
        model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    except KeyError:
-        raise KeyError(
-            "the model {} you specified is not supported. You are welcome to add it and open a PR :)"
-        )
+        raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)")

    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    model = model_class.from_pretrained(args.model_name_or_path)
    model.to(args.device)

-    args.length = adjust_length_to_model(
-        args.length, max_sequence_length=model.config.max_position_embeddings
-    )
+    args.length = adjust_length_to_model(args.length, max_sequence_length=model.config.max_position_embeddings)
    logger.info(args)

    prompt_text = args.prompt if args.prompt else input("Model prompt >>> ")
@@ -201,7 +203,7 @@ def main():
    if requires_preprocessing:
        prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type)
        prompt_text = prepare_input(args, model, tokenizer, prompt_text)
-    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors='pt')
+    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")

    output_sequences = model.generate(
        input_ids=encoded_prompt,
@@ -212,7 +214,7 @@ def main():
        repetition_penalty=args.repetition_penalty,
    )

-    # Batch size == 1. to add more examples please use num_return_sequences > 1 
+    # Batch size == 1. to add more examples please use num_return_sequences > 1
    generated_sequence = output_sequences[0].tolist()
    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
    text = text[: t.find(args.stop_token) if args.stop_token else None]
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -26,8 +26,7 @@ import json

 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler

 try:
@@ -37,25 +36,30 @@ except:

 from tqdm import tqdm, trange

-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForSequenceClassification, BertTokenizer,
-                                  RobertaConfig,
-                                  RobertaForSequenceClassification,
-                                  RobertaTokenizer,
-                                  XLMConfig, XLMForSequenceClassification,
-                                  XLMTokenizer, XLNetConfig,
-                                  XLNetForSequenceClassification,
-                                  XLNetTokenizer,
-                                  DistilBertConfig,
-                                  DistilBertForSequenceClassification,
-                                  DistilBertTokenizer,
-                                  AlbertConfig,
-                                  AlbertForSequenceClassification, 
-                                  AlbertTokenizer,
-                                  XLMRobertaConfig,
-                                  XLMRobertaForSequenceClassification,
-                                  XLMRobertaTokenizer,
-                                )
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertForSequenceClassification,
+    BertTokenizer,
+    RobertaConfig,
+    RobertaForSequenceClassification,
+    RobertaTokenizer,
+    XLMConfig,
+    XLMForSequenceClassification,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetForSequenceClassification,
+    XLNetTokenizer,
+    DistilBertConfig,
+    DistilBertForSequenceClassification,
+    DistilBertTokenizer,
+    AlbertConfig,
+    AlbertForSequenceClassification,
+    AlbertTokenizer,
+    XLMRobertaConfig,
+    XLMRobertaForSequenceClassification,
+    XLMRobertaTokenizer,
+)

 from transformers import AdamW, get_linear_schedule_with_warmup

@@ -66,17 +70,22 @@ from transformers import glue_convert_examples_to_features as convert_examples_t

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, 
-                                                                                RobertaConfig, DistilBertConfig)), ())
+ALL_MODELS = sum(
+    (
+        tuple(conf.pretrained_config_archive_map.keys())
+        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
+    ),
+    (),
+)

 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
-    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
-    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
-    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
-    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
-    'xlmroberta': (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
+    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
+    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
+    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
+    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
+    "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
+    "xlmroberta": (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
 }


@@ -104,20 +113,27 @@ def train(args, train_dataset, model, tokenizer):
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )

    # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
+    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+        os.path.join(args.model_name_or_path, "scheduler.pt")
+    ):
        # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
@@ -132,17 +148,21 @@ def train(args, train_dataset, model, tokenizer):

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

@@ -152,7 +172,7 @@ def train(args, train_dataset, model, tokenizer):
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to gobal_step of last saved checkpoint from model path
-        global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
+        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

@@ -163,7 +183,9 @@ def train(args, train_dataset, model, tokenizer):

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
-    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+    )
    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
@@ -176,16 +198,16 @@ def train(args, train_dataset, model, tokenizer):

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      'labels':         batch[3]}
-            if args.model_type != 'distilbert':
-                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = (
+                    batch[2] if args.model_type in ["bert", "xlnet"] else None
+                )  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

@@ -209,36 +231,40 @@ def train(args, train_dataset, model, tokenizer):

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
-                            eval_key = 'eval_{}'.format(key)
+                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
-                    logs['learning_rate'] = learning_rate_scalar
-                    logs['loss'] = loss_scalar
+                    logs["learning_rate"] = learning_rate_scalar
+                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
-                    print(json.dumps({**logs, **{'step': global_step}}))
+                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
@@ -257,7 +283,7 @@ def train(args, train_dataset, model, tokenizer):
 def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
-    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
+    eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
@@ -288,11 +314,11 @@ def evaluate(args, model, tokenizer, prefix=""):
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
-                inputs = {'input_ids':      batch[0],
-                          'attention_mask': batch[1],
-                          'labels':         batch[3]}
-                if args.model_type != 'distilbert':
-                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
+                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+                if args.model_type != "distilbert":
+                    inputs["token_type_ids"] = (
+                        batch[2] if args.model_type in ["bert", "xlnet"] else None
+                    )  # XLM, DistilBERT and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

@@ -300,10 +326,10 @@ def evaluate(args, model, tokenizer, prefix=""):
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
-                out_label_ids = inputs['labels'].detach().cpu().numpy()
+                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
@@ -330,29 +356,36 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
-        'dev' if evaluate else 'train',
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length),
-        str(task)))
+    cached_features_file = os.path.join(
+        args.data_dir,
+        "cached_{}_{}_{}_{}".format(
+            "dev" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+            str(task),
+        ),
+    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
-        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta', 'xlmroberta']:
+        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
-        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-        features = convert_examples_to_features(examples,
-                                                tokenizer,
-                                                label_list=label_list,
-                                                max_length=args.max_seq_length,
-                                                output_mode=output_mode,
-                                                pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-                                                pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
+        examples = (
+            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+        )
+        features = convert_examples_to_features(
+            examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
+            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -369,7 +402,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
- 
+
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

@@ -378,90 +411,149 @@ def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--task_name", default=None, type=str, required=True,
-                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--task_name",
+        default=None,
+        type=str,
+        required=True,
+        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )

    ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences longer "
-                             "than this will be truncated, sequences shorter will be padded.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )

-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")     
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Avoid using CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -473,16 +565,24 @@ def main():
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )

    # Set seed
    set_seed(args)
@@ -502,17 +602,23 @@ def main():

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          num_labels=num_labels,
-                                          finetuning_task=args.task_name,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=args.task_name,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -521,14 +627,12 @@ def main():

    logger.info("Training/evaluation parameters %s", args)

-
    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

-
    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
@@ -538,36 +642,39 @@ def main():
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

-
    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-            
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -42,37 +42,55 @@ except:

 from tqdm import tqdm, trange

-from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
-                                  BertConfig, BertForMaskedLM, BertTokenizer,
-                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
-                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
-                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
-                                  DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
-                                  CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)
+from transformers import (
+    WEIGHTS_NAME,
+    AdamW,
+    get_linear_schedule_with_warmup,
+    BertConfig,
+    BertForMaskedLM,
+    BertTokenizer,
+    GPT2Config,
+    GPT2LMHeadModel,
+    GPT2Tokenizer,
+    OpenAIGPTConfig,
+    OpenAIGPTLMHeadModel,
+    OpenAIGPTTokenizer,
+    RobertaConfig,
+    RobertaForMaskedLM,
+    RobertaTokenizer,
+    DistilBertConfig,
+    DistilBertForMaskedLM,
+    DistilBertTokenizer,
+    CamembertConfig,
+    CamembertForMaskedLM,
+    CamembertTokenizer,
+)


 logger = logging.getLogger(__name__)


 MODEL_CLASSES = {
-    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
-    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
-    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
-    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
-    'camembert': (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)
+    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
+    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
+    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
+    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
+    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
 }


 class TextDataset(Dataset):
-    def __init__(self, tokenizer, args, file_path='train', block_size=512):
+    def __init__(self, tokenizer, args, file_path="train", block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, args.model_name_or_path + '_cached_lm_' + str(block_size) + '_' + filename)
+        cached_features_file = os.path.join(
+            directory, args.model_name_or_path + "_cached_lm_" + str(block_size) + "_" + filename
+        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
-            with open(cached_features_file, 'rb') as handle:
+            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)
@@ -83,14 +101,14 @@ class TextDataset(Dataset):

            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

-            for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
-                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
+            for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
+                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))
            # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should loook for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.

            logger.info("Saving features into cached file %s", cached_features_file)
-            with open(cached_features_file, 'wb') as handle:
+            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
@@ -101,7 +119,12 @@ class TextDataset(Dataset):


 def load_and_cache_examples(args, tokenizer, evaluate=False):
-    dataset = TextDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size)
+    dataset = TextDataset(
+        tokenizer,
+        args,
+        file_path=args.eval_data_file if evaluate else args.train_data_file,
+        block_size=args.block_size,
+    )
    return dataset


@@ -120,7 +143,7 @@ def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
        return

    # Check if we should delete older checkpoint(s)
-    glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix)))
+    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))
    if len(glob_checkpoints) <= args.save_total_limit:
        return

@@ -129,7 +152,7 @@ def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
-            regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path)
+            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

@@ -147,7 +170,9 @@ def mask_tokens(inputs, tokenizer, args):
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
-    special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
+    special_tokens_mask = [
+        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
+    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens
@@ -181,19 +206,26 @@ def train(args, train_dataset, model, tokenizer):
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )

    # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
+    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+        os.path.join(args.model_name_or_path, "scheduler.pt")
+    ):
        # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
@@ -208,17 +240,21 @@ def train(args, train_dataset, model, tokenizer):

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

@@ -228,7 +264,7 @@ def train(args, train_dataset, model, tokenizer):
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to gobal_step of last saved checkpoint from model path
-        global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
+        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

@@ -239,16 +275,18 @@ def train(args, train_dataset, model, tokenizer):

    tr_loss, logging_loss = 0.0, 0.0

-    model_to_resize = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
-    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+    )
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
-            
+
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
@@ -285,31 +323,35 @@ def train(args, train_dataset, model, tokenizer):

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    checkpoint_prefix = 'checkpoint'
+                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step))
+                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
@@ -365,9 +407,7 @@ def evaluate(args, model, tokenizer, prefix=""):
    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

-    result = {
-        "perplexity": perplexity
-    }
+    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
@@ -383,107 +423,167 @@ def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
-    parser.add_argument("--train_data_file", default=None, type=str, required=True,
-                        help="The input training data file (a text file).")
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)."
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )

    ## Other parameters
-    parser.add_argument("--eval_data_file", default=None, type=str,
-                        help="An optional input evaluation data file to evaluate the perplexity on (a text file).")
+    parser.add_argument(
+        "--eval_data_file",
+        default=None,
+        type=str,
+        help="An optional input evaluation data file to evaluate the perplexity on (a text file).",
+    )

-    parser.add_argument("--model_type", default="bert", type=str,
-                        help="The model architecture to be fine-tuned.")
-    parser.add_argument("--model_name_or_path", default="bert-base-cased", type=str,
-                        help="The model checkpoint for weights initialization.")
+    parser.add_argument("--model_type", default="bert", type=str, help="The model architecture to be fine-tuned.")
+    parser.add_argument(
+        "--model_name_or_path",
+        default="bert-base-cased",
+        type=str,
+        help="The model checkpoint for weights initialization.",
+    )

-    parser.add_argument("--mlm", action='store_true',
-                        help="Train with masked-language modeling loss instead of language modeling.")
-    parser.add_argument("--mlm_probability", type=float, default=0.15,
-                        help="Ratio of tokens to mask for masked language modeling loss")
+    parser.add_argument(
+        "--mlm", action="store_true", help="Train with masked-language modeling loss instead of language modeling."
+    )
+    parser.add_argument(
+        "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
+    )

-    parser.add_argument("--config_name", default="", type=str,
-                        help="Optional pretrained config name or path if not the same as model_name_or_path")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)")
-    parser.add_argument("--block_size", default=-1, type=int,
-                        help="Optional input sequence length after tokenization."
-                             "The training dataset will be truncated in block of this size for training."
-                             "Default to the model max input length for single sentence inputs (take into account special tokens).")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Run evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--config_name",
+        default="",
+        type=str,
+        help="Optional pretrained config name or path if not the same as model_name_or_path",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)",
+    )
+    parser.add_argument(
+        "--block_size",
+        default=-1,
+        type=int,
+        help="Optional input sequence length after tokenization."
+        "The training dataset will be truncated in block of this size for training."
+        "Default to the model max input length for single sentence inputs (take into account special tokens).",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )

-    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=1.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument('--save_total_limit', type=int, default=None,
-                        help='Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default')
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Avoid using CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--save_total_limit",
+        type=int,
+        default=None,
+        help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
+    )
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

    if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
-        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
-                         "flag (masked language modeling).")
+        raise ValueError(
+            "BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
+            "flag (masked language modeling)."
+        )
    if args.eval_data_file is None and args.do_eval:
-        raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
-                         "or remove the --do_eval argument.")
+        raise ValueError(
+            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
+            "or remove the --do_eval argument."
+        )

-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -495,16 +595,24 @@ def main():
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )

    # Set seed
    set_seed(args)
@@ -514,18 +622,26 @@ def main():
        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
    if args.block_size <= 0:
-        args.block_size = tokenizer.max_len_single_sentence  # Our input block size will be the max possible for the model
+        args.block_size = (
+            tokenizer.max_len_single_sentence
+        )  # Our input block size will be the max possible for the model
    args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
    model.to(args.device)

    if args.local_rank == 0:
@@ -546,7 +662,6 @@ def main():
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

-
    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
@@ -556,35 +671,38 @@ def main():
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

-
    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-            
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
--- a/examples/run_multiple_choice.py
+++ b/examples/run_multiple_choice.py
@@ -26,8 +26,7 @@ import random

 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler

 try:
@@ -37,34 +36,38 @@ except:

 from tqdm import tqdm, trange

-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForMultipleChoice, BertTokenizer,
-                                  XLNetConfig, XLNetForMultipleChoice,
-                                  XLNetTokenizer, RobertaConfig,
-                                  RobertaForMultipleChoice, RobertaTokenizer)
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertForMultipleChoice,
+    BertTokenizer,
+    XLNetConfig,
+    XLNetForMultipleChoice,
+    XLNetTokenizer,
+    RobertaConfig,
+    RobertaForMultipleChoice,
+    RobertaTokenizer,
+)

 from transformers import AdamW, get_linear_schedule_with_warmup

-from utils_multiple_choice import (convert_examples_to_features, processors)
+from utils_multiple_choice import convert_examples_to_features, processors

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), ())
+ALL_MODELS = sum(
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), ()
+)

 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForMultipleChoice, BertTokenizer),
-    'xlnet': (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
-    'roberta': (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer)
+    "bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
+    "xlnet": (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
+    "roberta": (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer),
 }

+
 def select_field(features, field):
-    return [
-        [
-            choice[field]
-            for choice in feature.choices_features
-        ]
-        for feature in features
-    ]
+    return [[choice[field] for choice in feature.choices_features] for feature in features]


 def simple_accuracy(preds, labels):
@@ -95,13 +98,18 @@ def train(args, train_dataset, model, tokenizer):
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
    if args.fp16:
        try:
            from apex import amp
@@ -115,17 +123,21 @@ def train(args, train_dataset, model, tokenizer):

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

@@ -141,15 +153,19 @@ def train(args, train_dataset, model, tokenizer):
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
-                      'labels':         batch[3]}
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                "token_type_ids": batch[2]
+                if args.model_type in ["bert", "xlnet"]
+                else None,  # XLM don't use segment_ids
+                "labels": batch[3],
+            }
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

@@ -171,10 +187,12 @@ def train(args, train_dataset, model, tokenizer):

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                        if results["eval_acc"] > best_dev_acc:
                            best_dev_acc = results["eval_acc"]
                            best_dev_loss = results["eval_loss"]
@@ -182,22 +200,33 @@ def train(args, train_dataset, model, tokenizer):
                            if args.do_test:
                                results_test = evaluate(args, model, tokenizer, test=True)
                                for key, value in results_test.items():
-                                    tb_writer.add_scalar('test_{}'.format(key), value, global_step)
-                                logger.info("test acc: %s, loss: %s, global steps: %s", str(results_test['eval_acc']), str(results_test['eval_loss']), str(global_step))
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
-                    logger.info("Average loss: %s at global step: %s", str((tr_loss - logging_loss)/args.logging_steps), str(global_step))
+                                    tb_writer.add_scalar("test_{}".format(key), value, global_step)
+                                logger.info(
+                                    "test acc: %s, loss: %s, global steps: %s",
+                                    str(results_test["eval_acc"]),
+                                    str(results_test["eval_loss"]),
+                                    str(global_step),
+                                )
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
+                    logger.info(
+                        "Average loss: %s at global step: %s",
+                        str((tr_loss - logging_loss) / args.logging_steps),
+                        str(global_step),
+                    )
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_vocabulary(output_dir)
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
@@ -246,10 +275,14 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
-                inputs = {'input_ids':      batch[0],
-                          'attention_mask': batch[1],
-                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
-                          'labels':         batch[3]}
+                inputs = {
+                    "input_ids": batch[0],
+                    "attention_mask": batch[1],
+                    "token_type_ids": batch[2]
+                    if args.model_type in ["bert", "xlnet"]
+                    else None,  # XLM don't use segment_ids
+                    "labels": batch[3],
+                }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

@@ -257,10 +290,10 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
-                out_label_ids = inputs['labels'].detach().cpu().numpy()
+                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = np.argmax(preds, axis=1)
@@ -273,8 +306,14 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(str(prefix) + " is test:" + str(test)))
            writer.write("model           =%s\n" % str(args.model_name_or_path))
-            writer.write("total batch size=%d\n" % (args.per_gpu_train_batch_size * args.gradient_accumulation_steps *
-                         (torch.distributed.get_world_size() if args.local_rank != -1 else 1)))
+            writer.write(
+                "total batch size=%d\n"
+                % (
+                    args.per_gpu_train_batch_size
+                    * args.gradient_accumulation_steps
+                    * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)
+                )
+            )
            writer.write("train num epochs=%d\n" % args.num_train_epochs)
            writer.write("fp16            =%s\n" % args.fp16)
            writer.write("max seq length  =%d\n" % args.max_seq_length)
@@ -291,17 +330,21 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
    processor = processors[task]()
    # Load data features from cache or dataset file
    if evaluate:
-        cached_mode = 'dev'
+        cached_mode = "dev"
    elif test:
-        cached_mode = 'test'
+        cached_mode = "test"
    else:
-        cached_mode = 'train'
+        cached_mode = "train"
    assert (evaluate == True and test == True) == False
-    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
-        cached_mode,
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length),
-        str(task)))
+    cached_features_file = os.path.join(
+        args.data_dir,
+        "cached_{}_{}_{}_{}".format(
+            cached_mode,
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+            str(task),
+        ),
+    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
@@ -320,8 +363,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
            label_list,
            args.max_seq_length,
            tokenizer,
-            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0
+            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
+            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -331,9 +374,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
-    all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
-    all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
+    all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
+    all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
+    all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
@@ -344,91 +387,150 @@ def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--task_name", default=None, type=str, required=True,
-                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--task_name",
+        default=None,
+        type=str,
+        required=True,
+        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )

    ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences longer "
-                             "than this will be truncated, sequences shorter will be padded.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set')
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Run evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_test", action="store_true", help="Whether to run test on the test set")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )

-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Avoid using CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -440,16 +542,24 @@ def main():
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )

    # Set seed
    set_seed(args)
@@ -468,17 +578,23 @@ def main():

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          num_labels=num_labels,
-                                          finetuning_task=args.task_name,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=args.task_name,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -494,7 +610,6 @@ def main():
        global_step, tr_loss, best_steps = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

-
    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
@@ -504,19 +619,20 @@ def main():
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

-
    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
@@ -524,17 +640,19 @@ def main():
            args.output_dir = args.model_name_or_path
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-            
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    if args.do_test and args.local_rank in [-1, 0]:
@@ -546,13 +664,13 @@ def main():
        #     logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-            
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix, test=True)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)
    if best_steps:
        logger.info("best steps of eval acc is the following checkpoints: %s", best_steps)
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -43,9 +43,12 @@ from transformers import XLMRobertaConfig, XLMRobertaForTokenClassification, XLM
 logger = logging.getLogger(__name__)

 ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig,
-                                                                   CamembertConfig, XLMRobertaConfig)),
-    ())
+    (
+        tuple(conf.pretrained_config_archive_map.keys())
+        for conf in (BertConfig, RobertaConfig, DistilBertConfig, CamembertConfig, XLMRobertaConfig)
+    ),
+    (),
+)

 MODEL_CLASSES = {
    "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
@@ -82,18 +85,24 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
-        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-         "weight_decay": args.weight_decay},
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )

    # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
+    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+        os.path.join(args.model_name_or_path, "scheduler.pt")
+    ):
        # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
@@ -108,18 +117,21 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                args.train_batch_size * args.gradient_accumulation_steps * (
-                    torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

@@ -129,7 +141,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to gobal_step of last saved checkpoint from model path
-        global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
+        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

@@ -140,7 +152,9 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
-    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+    )
    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
@@ -153,11 +167,11 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {"input_ids": batch[0],
-                      "attention_mask": batch[1],
-                      "labels": batch[3]}
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None  # XLM and RoBERTa don"t use segment_ids
+                inputs["token_type_ids"] = (
+                    batch[2] if args.model_type in ["bert", "xlnet"] else None
+                )  # XLM and RoBERTa don"t use segment_ids

            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
@@ -187,7 +201,9 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev")
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
@@ -200,15 +216,17 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
@@ -249,11 +267,11 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
-            inputs = {"input_ids": batch[0],
-                      "attention_mask": batch[1],
-                      "labels": batch[3]}
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None  # XLM and RoBERTa don"t use segment_ids
+                inputs["token_type_ids"] = (
+                    batch[2] if args.model_type in ["bert", "xlnet"] else None
+                )  # XLM and RoBERTa don"t use segment_ids
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

@@ -287,7 +305,7 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
        "loss": eval_loss,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
-        "f1": f1_score(out_label_list, preds_list)
+        "f1": f1_score(out_label_list, preds_list),
    }

    logger.info("***** Eval results %s *****", prefix)
@@ -302,29 +320,36 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format(mode,
-        list(filter(None, args.model_name_or_path.split("/"))).pop(),
-        str(args.max_seq_length)))
+    cached_features_file = os.path.join(
+        args.data_dir,
+        "cached_{}_{}_{}".format(
+            mode, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length)
+        ),
+    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        examples = read_examples_from_file(args.data_dir, mode)
-        features = convert_examples_to_features(examples, labels, args.max_seq_length, tokenizer,
-                                                cls_token_at_end=bool(args.model_type in ["xlnet"]),
-                                                # xlnet has a cls token at the end
-                                                cls_token=tokenizer.cls_token,
-                                                cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
-                                                sep_token=tokenizer.sep_token,
-                                                sep_token_extra=bool(args.model_type in ["roberta"]),
-                                                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
-                                                pad_on_left=bool(args.model_type in ["xlnet"]),
-                                                # pad on the left for xlnet
-                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-                                                pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
-                                                pad_token_label_id=pad_token_label_id
-                                                )
+        features = convert_examples_to_features(
+            examples,
+            labels,
+            args.max_seq_length,
+            tokenizer,
+            cls_token_at_end=bool(args.model_type in ["xlnet"]),
+            # xlnet has a cls token at the end
+            cls_token=tokenizer.cls_token,
+            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
+            sep_token=tokenizer.sep_token,
+            sep_token_extra=bool(args.model_type in ["roberta"]),
+            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
+            pad_on_left=bool(args.model_type in ["xlnet"]),
+            # pad on the left for xlnet
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
+            pad_token_label_id=pad_token_label_id,
+        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
@@ -346,95 +371,151 @@ def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )

    ## Other parameters
-    parser.add_argument("--labels", default="", type=str,
-                        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences longer "
-                             "than this will be truncated, sequences shorter will be padded.")
-    parser.add_argument("--do_train", action="store_true",
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true",
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_predict", action="store_true",
-                        help="Whether to run predictions on the test set.")
-    parser.add_argument("--evaluate_during_training", action="store_true",
-                        help="Whether to run evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action="store_true",
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--labels",
+        default="",
+        type=str,
+        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
+    )
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
+    parser.add_argument(
+        "--evaluate_during_training",
+        action="store_true",
+        help="Whether to run evaluation during training at each logging step.",
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )

-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

-    parser.add_argument("--logging_steps", type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action="store_true",
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action="store_true",
-                        help="Avoid using CUDA when available")
-    parser.add_argument("--overwrite_output_dir", action="store_true",
-                        help="Overwrite the content of the output directory")
-    parser.add_argument("--overwrite_cache", action="store_true",
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument("--seed", type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

-    parser.add_argument("--fp16", action="store_true",
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument("--fp16_opt_level", type=str, default="O1",
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

-    if os.path.exists(args.output_dir) and os.listdir(
-            args.output_dir) and args.do_train and not args.overwrite_output_dir:
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir))
+                args.output_dir
+            )
+        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -451,11 +532,19 @@ def main():
    args.device = device

    # Setup logging
-    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-                        datefmt="%m/%d/%Y %H:%M:%S",
-                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )

    # Set seed
    set_seed(args)
@@ -472,16 +561,22 @@ def main():

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          num_labels=num_labels,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool(".ckpt" in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -505,7 +600,9 @@ def main():
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

@@ -518,7 +615,9 @@ def main():
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
@@ -565,4 +664,3 @@ def main():

 if __name__ == "__main__":
    main()
-
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -17,7 +17,11 @@

 from __future__ import absolute_import, division, print_function
 from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult
-from transformers.data.metrics.squad_metrics import compute_predictions_logits, compute_predictions_log_probs, squad_evaluate
+from transformers.data.metrics.squad_metrics import (
+    compute_predictions_logits,
+    compute_predictions_log_probs,
+    squad_evaluate,
+)

 import argparse
 import logging
@@ -27,8 +31,7 @@ import glob
 import timeit
 import numpy as np
 import torch
-from torch.utils.data import (
-    DataLoader, RandomSampler, SequentialSampler, TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler

 try:
@@ -38,32 +41,47 @@ except:

 from tqdm import tqdm, trange

-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForQuestionAnswering, BertTokenizer,
-                                  RobertaForQuestionAnswering, RobertaTokenizer, RobertaConfig,
-                                  XLMConfig, XLMForQuestionAnswering,
-                                  XLMTokenizer, XLNetConfig,
-                                  XLNetForQuestionAnswering,
-                                  XLNetTokenizer,
-                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer,
-                                  AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer,
-                                  XLMConfig, XLMForQuestionAnswering, XLMTokenizer,
-                                  )
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertForQuestionAnswering,
+    BertTokenizer,
+    RobertaForQuestionAnswering,
+    RobertaTokenizer,
+    RobertaConfig,
+    XLMConfig,
+    XLMForQuestionAnswering,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetForQuestionAnswering,
+    XLNetTokenizer,
+    DistilBertConfig,
+    DistilBertForQuestionAnswering,
+    DistilBertTokenizer,
+    AlbertConfig,
+    AlbertForQuestionAnswering,
+    AlbertTokenizer,
+    XLMConfig,
+    XLMForQuestionAnswering,
+    XLMTokenizer,
+)

 from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
-                  for conf in (BertConfig, RobertaConfig, XLNetConfig, XLMConfig)), ())
+ALL_MODELS = sum(
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, XLNetConfig, XLMConfig)),
+    (),
+)

 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
-    'roberta': (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer),
-    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
-    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
-    'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
+    "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer),
+    "roberta": (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer),
+    "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
+    "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
+    "albert": (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
 }


@@ -85,49 +103,44 @@ def train(args, train_dataset, model, tokenizer):
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_sampler = RandomSampler(
-        train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(
-        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (
-            len(train_dataloader) // args.gradient_accumulation_steps) + 1
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
-        t_total = len(
-            train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(
-            nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(
-            nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
-    optimizer = AdamW(optimizer_grouped_parameters,
-                      lr=args.learning_rate, eps=args.adam_epsilon)
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )

    # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
+    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+        os.path.join(args.model_name_or_path, "scheduler.pt")
+    ):
        # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(
-            os.path.join(args.model_name_or_path, 'optimizer.pt')))
-        scheduler.load_state_dict(torch.load(
-            os.path.join(args.model_name_or_path, 'scheduler.pt')))
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
-            raise ImportError(
-                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

-        model, optimizer = amp.initialize(
-            model, optimizer, opt_level=args.fp16_opt_level)
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
@@ -135,20 +148,22 @@ def train(args, train_dataset, model, tokenizer):

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d",
-                args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
-    logger.info("  Gradient Accumulation steps = %d",
-                args.gradient_accumulation_steps)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
@@ -157,29 +172,25 @@ def train(args, train_dataset, model, tokenizer):
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to gobal_step of last saved checkpoint from model path
-        global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
-        epochs_trained = global_step // (len(train_dataloader) //
-                                         args.gradient_accumulation_steps)
-        steps_trained_in_current_epoch = global_step % (
-            len(train_dataloader) // args.gradient_accumulation_steps)
+        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
+        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
+        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

-        logger.info(
-            "  Continuing training from checkpoint, will skip to saved global_step")
+        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
-        logger.info("  Will skip the first %d steps in the first epoch",
-                    steps_trained_in_current_epoch)
+        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
-    train_iterator = trange(epochs_trained, int(
-        args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+    )
    # Added here for reproductibility (even between python 2 and 3)
    set_seed(args)

    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
-                              disable=args.local_rank not in [-1, 0])
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
@@ -191,18 +202,17 @@ def train(args, train_dataset, model, tokenizer):
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
-                'input_ids':       batch[0],
-                'attention_mask':  batch[1],
-                'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2],
-                'start_positions': batch[3],
-                'end_positions':   batch[4],
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                "token_type_ids": None if args.model_type in ["xlm", "roberta", "distilbert"] else batch[2],
+                "start_positions": batch[3],
+                "end_positions": batch[4],
            }

-            if args.model_type in ['xlnet', 'xlm']:
-                inputs.update({'cls_index': batch[5],
-                               'p_mask':       batch[6]})
+            if args.model_type in ["xlnet", "xlm"]:
+                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
-                    inputs.update({'is_impossible': batch[7]})
+                    inputs.update({"is_impossible": batch[7]})
            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]
@@ -221,11 +231,9 @@ def train(args, train_dataset, model, tokenizer):
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(
-                        amp.master_params(optimizer), args.max_grad_norm)
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
-                    torch.nn.utils.clip_grad_norm_(
-                        model.parameters(), args.max_grad_norm)
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
@@ -238,36 +246,27 @@ def train(args, train_dataset, model, tokenizer):
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
-                            tb_writer.add_scalar(
-                                'eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar(
-                        'lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar(
-                        'loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    output_dir = os.path.join(
-                        args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
-                    model_to_save = model.module if hasattr(
-                        model, 'module') else model
+                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

-                    torch.save(args, os.path.join(
-                        output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

-                    torch.save(optimizer.state_dict(), os.path.join(
-                        output_dir, 'optimizer.pt'))
-                    torch.save(scheduler.state_dict(), os.path.join(
-                        output_dir, 'scheduler.pt'))
-                    logger.info(
-                        "Saving optimizer and scheduler states to %s", output_dir)
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
@@ -283,8 +282,7 @@ def train(args, train_dataset, model, tokenizer):


 def evaluate(args, model, tokenizer, prefix=""):
-    dataset, examples, features = load_and_cache_examples(
-        args, tokenizer, evaluate=True, output_examples=True)
+    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)
@@ -293,8 +291,7 @@ def evaluate(args, model, tokenizer, prefix=""):

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
-    eval_dataloader = DataLoader(
-        dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
@@ -314,15 +311,15 @@ def evaluate(args, model, tokenizer, prefix=""):

        with torch.no_grad():
            inputs = {
-                'input_ids':      batch[0],
-                'attention_mask': batch[1],
-                'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2],
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                "token_type_ids": None if args.model_type in ["xlm", "roberta", "distilbert"] else batch[2],
            }
            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
-            if args.model_type in ['xlnet', 'xlm']:
-                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
+            if args.model_type in ["xlnet", "xlm"]:
+                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

            outputs = model(**inputs)

@@ -342,53 +339,68 @@ def evaluate(args, model, tokenizer, prefix=""):
                cls_logits = output[4]

                result = SquadResult(
-                    unique_id, start_logits, end_logits,
+                    unique_id,
+                    start_logits,
+                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
-                    cls_logits=cls_logits
+                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
-                result = SquadResult(
-                    unique_id, start_logits, end_logits
-                )
+                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
-    logger.info("  Evaluation done in total %f secs (%f sec per example)",
-                evalTime, evalTime / len(dataset))
+    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
-    output_prediction_file = os.path.join(
-        args.output_dir, "predictions_{}.json".format(prefix))
-    output_nbest_file = os.path.join(
-        args.output_dir, "nbest_predictions_{}.json".format(prefix))
+    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
+    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
-        output_null_log_odds_file = os.path.join(
-            args.output_dir, "null_odds_{}.json".format(prefix))
+        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
-    if args.model_type in ['xlnet', 'xlm']:
-        start_n_top = model.config.start_n_top if hasattr(
-            model, "config") else model.module.config.start_n_top
-        end_n_top = model.config.end_n_top if hasattr(
-            model, "config") else model.module.config.end_n_top
+    if args.model_type in ["xlnet", "xlm"]:
+        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
+        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top

-        predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size,
-                                                    args.max_answer_length, output_prediction_file,
-                                                    output_nbest_file, output_null_log_odds_file,
-                                                    start_n_top, end_n_top,
-                                                    args.version_2_with_negative, tokenizer, args.verbose_logging)
+        predictions = compute_predictions_log_probs(
+            examples,
+            features,
+            all_results,
+            args.n_best_size,
+            args.max_answer_length,
+            output_prediction_file,
+            output_nbest_file,
+            output_null_log_odds_file,
+            start_n_top,
+            end_n_top,
+            args.version_2_with_negative,
+            tokenizer,
+            args.verbose_logging,
+        )
    else:
-        predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size,
-                        args.max_answer_length, args.do_lower_case, output_prediction_file,
-                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
-                        args.version_2_with_negative, args.null_score_diff_threshold, tokenizer)
+        predictions = compute_predictions_logits(
+            examples,
+            features,
+            all_results,
+            args.n_best_size,
+            args.max_answer_length,
+            args.do_lower_case,
+            output_prediction_file,
+            output_nbest_file,
+            output_null_log_odds_file,
+            args.verbose_logging,
+            args.version_2_with_negative,
+            args.null_score_diff_threshold,
+            tokenizer,
+        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
@@ -402,16 +414,18 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
-    cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format(
-        'dev' if evaluate else 'train',
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length))
+    cached_features_file = os.path.join(
+        input_dir,
+        "cached_{}_{}_{}".format(
+            "dev" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
-        logger.info("Loading features from cached file %s",
-                    cached_features_file)
+        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset = features_and_dataset["features"], features_and_dataset["dataset"]
    else:
@@ -421,16 +435,13 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
            try:
                import tensorflow_datasets as tfds
            except ImportError:
-                raise ImportError(
-                    "If not data_dir is specified, tensorflow_datasets needs to be installed.")
+                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
-                logger.warn(
-                    "tensorflow_datasets does not handle version 2 of SQuAD.")
+                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
-            examples = SquadV1Processor().get_examples_from_dataset(
-                tfds_examples, evaluate=evaluate)
+            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
@@ -445,15 +456,13 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
-            return_dataset='pt',
+            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
-            logger.info("Saving features into cached file %s",
-                        cached_features_file)
-            torch.save({"features": features, "dataset": dataset},
-                       cached_features_file)
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save({"features": features, "dataset": dataset}, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
@@ -468,140 +477,232 @@ def main():
    parser = argparse.ArgumentParser()

    # Required parameters
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model checkpoints and predictions will be written.")
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model checkpoints and predictions will be written.",
+    )

    # Other parameters
-    parser.add_argument("--data_dir", default=None, type=str,
-                        help="The input data dir. Should contain the .json files for the task." +
-                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
-    parser.add_argument("--train_file", default=None, type=str,
-                        help="The input training file. If a data dir is specified, will look for the file there" +
-                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
-    parser.add_argument("--predict_file", default=None, type=str,
-                        help="The input evaluation file. If a data dir is specified, will look for the file there" +
-                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        help="The input data dir. Should contain the .json files for the task."
+        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
+    )
+    parser.add_argument(
+        "--train_file",
+        default=None,
+        type=str,
+        help="The input training file. If a data dir is specified, will look for the file there"
+        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
+    )
+    parser.add_argument(
+        "--predict_file",
+        default=None,
+        type=str,
+        help="The input evaluation file. If a data dir is specified, will look for the file there"
+        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
+    )
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )

-    parser.add_argument('--version_2_with_negative', action='store_true',
-                        help='If true, the SQuAD examples contain some that do not have an answer.')
-    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
-                        help="If null_score - best_non_null is greater than the threshold predict null.")
+    parser.add_argument(
+        "--version_2_with_negative",
+        action="store_true",
+        help="If true, the SQuAD examples contain some that do not have an answer.",
+    )
+    parser.add_argument(
+        "--null_score_diff_threshold",
+        type=float,
+        default=0.0,
+        help="If null_score - best_non_null is greater than the threshold predict null.",
+    )

-    parser.add_argument("--max_seq_length", default=384, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
-                             "longer than this will be truncated, and sequences shorter than this will be padded.")
-    parser.add_argument("--doc_stride", default=128, type=int,
-                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
-    parser.add_argument("--max_query_length", default=64, type=int,
-                        help="The maximum number of tokens for the question. Questions longer than this will "
-                             "be truncated to this length.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--max_seq_length",
+        default=384,
+        type=int,
+        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
+        "longer than this will be truncated, and sequences shorter than this will be padded.",
+    )
+    parser.add_argument(
+        "--doc_stride",
+        default=128,
+        type=int,
+        help="When splitting up a long document into chunks, how much stride to take between chunks.",
+    )
+    parser.add_argument(
+        "--max_query_length",
+        default=64,
+        type=int,
+        help="The maximum number of tokens for the question. Questions longer than this will "
+        "be truncated to this length.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )

-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
-    parser.add_argument("--n_best_size", default=20, type=int,
-                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
-    parser.add_argument("--max_answer_length", default=30, type=int,
-                        help="The maximum length of an answer that can be generated. This is needed because the start "
-                             "and end predictions are not conditioned on one another.")
-    parser.add_argument("--verbose_logging", action='store_true',
-                        help="If true, all of the warnings related to data processing will be printed. "
-                             "A number of warnings are expected for a normal SQuAD evaluation.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+    parser.add_argument(
+        "--n_best_size",
+        default=20,
+        type=int,
+        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
+    )
+    parser.add_argument(
+        "--max_answer_length",
+        default=30,
+        type=int,
+        help="The maximum length of an answer that can be generated. This is needed because the start "
+        "and end predictions are not conditioned on one another.",
+    )
+    parser.add_argument(
+        "--verbose_logging",
+        action="store_true",
+        help="If true, all of the warnings related to data processing will be printed. "
+        "A number of warnings are expected for a normal SQuAD evaluation.",
+    )

-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")

-    parser.add_argument('--threads', type=int, default=1, help='multiple threads for converting example to features')
+    parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")
    args = parser.parse_args()

-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
-        ptvsd.enable_attach(
-            address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device(
-            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
-    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt='%m/%d/%Y %H:%M:%S',
-                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )

    # Set seed
    set_seed(args)
@@ -613,16 +714,21 @@ def main():

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool(
-                                            '.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
@@ -638,18 +744,16 @@ def main():
    if args.fp16:
        try:
            import apex
-            apex.amp.register_half_function(torch, 'einsum')
+
+            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
-            raise ImportError(
-                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    # Training
    if args.do_train:
-        train_dataset = load_and_cache_examples(
-            args, tokenizer, evaluate=False, output_examples=False)
+        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
-        logger.info(" global_step = %s, average loss = %s",
-                    global_step, tr_loss)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
@@ -661,18 +765,16 @@ def main():
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training
-        model_to_save = model.module if hasattr(model, 'module') else model
+        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(
-            args.output_dir, force_download=True)
-        tokenizer = tokenizer_class.from_pretrained(
-            args.output_dir, do_lower_case=args.do_lower_case)
+        model = model_class.from_pretrained(args.output_dir, force_download=True)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
@@ -682,7 +784,10 @@ def main():
            logger.info("Loading checkpoints saved during training for evaluation")
            checkpoints = [args.output_dir]
            if args.eval_all_checkpoints:
-                checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+                checkpoints = list(
+                    os.path.dirname(c)
+                    for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+                )
                logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
        else:
            logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
@@ -692,17 +797,14 @@ def main():

        for checkpoint in checkpoints:
            # Reload the model
-            global_step = checkpoint.split(
-                '-')[-1] if len(checkpoints) > 1 else ""
-            model = model_class.from_pretrained(
-                checkpoint, force_download=True)
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            model = model_class.from_pretrained(checkpoint, force_download=True)
            model.to(args.device)

            # Evaluate
            result = evaluate(args, model, tokenizer, prefix=global_step)

-            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v)
-                          for k, v in result.items())
+            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)

    logger.info("Results: {}".format(results))
--- a/examples/run_tf_glue.py
+++ b/examples/run_tf_glue.py
@@ -1,7 +1,14 @@
 import os
 import tensorflow as tf
 import tensorflow_datasets
-from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig, glue_convert_examples_to_features, BertForSequenceClassification, glue_processors
+from transformers import (
+    BertTokenizer,
+    TFBertForSequenceClassification,
+    BertConfig,
+    glue_convert_examples_to_features,
+    BertForSequenceClassification,
+    glue_processors,
+)

 # script parameters
 BATCH_SIZE = 32
@@ -16,7 +23,7 @@ if TASK == "sst-2":
    TFDS_TASK = "sst2"
 elif TASK == "sts-b":
    TFDS_TASK = "stsb"
-else: 
+else:
    TFDS_TASK = TASK

 num_labels = len(glue_processors[TASK]().get_labels())
@@ -27,29 +34,29 @@ tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})

 # Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
 config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels)
-tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
-model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config)
+tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+model = TFBertForSequenceClassification.from_pretrained("bert-base-cased", config=config)

 # Load dataset via TensorFlow Datasets
-data, info = tensorflow_datasets.load(f'glue/{TFDS_TASK}', with_info=True)
-train_examples = info.splits['train'].num_examples
+data, info = tensorflow_datasets.load(f"glue/{TFDS_TASK}", with_info=True)
+train_examples = info.splits["train"].num_examples

 # MNLI expects either validation_matched or validation_mismatched
-valid_examples = info.splits['validation'].num_examples
+valid_examples = info.splits["validation"].num_examples

 # Prepare dataset for GLUE as a tf.data.Dataset instance
-train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, TASK)
+train_dataset = glue_convert_examples_to_features(data["train"], tokenizer, 128, TASK)

 # MNLI expects either validation_matched or validation_mismatched
-valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, TASK)
+valid_dataset = glue_convert_examples_to_features(data["validation"], tokenizer, 128, TASK)
 train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
 valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)

-# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
+# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
 opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
 if USE_AMP:
    # loss scaling is currently required when using mixed precision
-    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
+    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic")


 if num_labels == 1:
@@ -57,37 +64,42 @@ if num_labels == 1:
 else:
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

-metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
 model.compile(optimizer=opt, loss=loss, metrics=[metric])

 # Train and evaluate using tf.keras.Model.fit()
-train_steps = train_examples//BATCH_SIZE
-valid_steps = valid_examples//EVAL_BATCH_SIZE
+train_steps = train_examples // BATCH_SIZE
+valid_steps = valid_examples // EVAL_BATCH_SIZE

-history = model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=train_steps,
-                    validation_data=valid_dataset, validation_steps=valid_steps)
+history = model.fit(
+    train_dataset,
+    epochs=EPOCHS,
+    steps_per_epoch=train_steps,
+    validation_data=valid_dataset,
+    validation_steps=valid_steps,
+)

 # Save TF2 model
-os.makedirs('./save/', exist_ok=True)
-model.save_pretrained('./save/')
+os.makedirs("./save/", exist_ok=True)
+model.save_pretrained("./save/")

 if TASK == "mrpc":
    # Load the TensorFlow model in PyTorch for inspection
-    # This is to demo the interoperability between the two frameworks, you don't have to 
+    # This is to demo the interoperability between the two frameworks, you don't have to
    # do this in real life (you can run the inference on the TF model).
-    pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
+    pytorch_model = BertForSequenceClassification.from_pretrained("./save/", from_tf=True)

    # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
-    sentence_0 = 'This research was consistent with his findings.'
-    sentence_1 = 'His findings were compatible with this research.'
-    sentence_2 = 'His findings were not compatible with this research.'
-    inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
-    inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
+    sentence_0 = "This research was consistent with his findings."
+    sentence_1 = "His findings were compatible with this research."
+    sentence_2 = "His findings were not compatible with this research."
+    inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors="pt")
+    inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors="pt")

    del inputs_1["special_tokens_mask"]
    del inputs_2["special_tokens_mask"]

    pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
    pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
-    print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
-    print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')
+    print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
+    print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
--- a/examples/run_tf_ner.py
+++ b/examples/run_tf_ner.py
@@ -21,189 +21,156 @@ from absl import app


 ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
-    ())
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)), ()
+)

 MODEL_CLASSES = {
    "bert": (BertConfig, TFBertForTokenClassification, BertTokenizer),
    "roberta": (RobertaConfig, TFRobertaForTokenClassification, RobertaTokenizer),
-    "distilbert": (DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer)
+    "distilbert": (DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer),
 }


 flags.DEFINE_string(
-    "data_dir", None,
-    "The input data dir. Should contain the .conll files (or other data files) "
-    "for the task.")
+    "data_dir", None, "The input data dir. Should contain the .conll files (or other data files) " "for the task."
+)
+
+flags.DEFINE_string("model_type", None, "Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))

 flags.DEFINE_string(
-    "model_type", None,
-    "Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    "model_name_or_path",
+    None,
+    "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+)
+
+flags.DEFINE_string("output_dir", None, "The output directory where the model checkpoints will be written.")

 flags.DEFINE_string(
-    "model_name_or_path", None,
-    "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+    "labels", "", "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."
+)

-flags.DEFINE_string(
-    "output_dir", None,
-    "The output directory where the model checkpoints will be written.")
+flags.DEFINE_string("config_name", "", "Pretrained config name or path if not the same as model_name")

-flags.DEFINE_string(
-    "labels", "",
-    "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
+flags.DEFINE_string("tokenizer_name", "", "Pretrained tokenizer name or path if not the same as model_name")

-flags.DEFINE_string(
-    "config_name", "",
-    "Pretrained config name or path if not the same as model_name")
-
-flags.DEFINE_string(
-    "tokenizer_name", "",
-    "Pretrained tokenizer name or path if not the same as model_name")
-
-flags.DEFINE_string(
-    "cache_dir", "",
-    "Where do you want to store the pre-trained models downloaded from s3")
+flags.DEFINE_string("cache_dir", "", "Where do you want to store the pre-trained models downloaded from s3")

 flags.DEFINE_integer(
-    "max_seq_length", 128,
+    "max_seq_length",
+    128,
    "The maximum total input sentence length after tokenization. "
    "Sequences longer than this will be truncated, sequences shorter "
-    "will be padded.")
+    "will be padded.",
+)

 flags.DEFINE_string(
-    "tpu", None,
+    "tpu",
+    None,
    "The Cloud TPU to use for training. This should be either the name "
    "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
-    "url.")
+    "url.",
+)

-flags.DEFINE_integer(
-    "num_tpu_cores", 8,
-    "Total number of TPU cores to use.")
+flags.DEFINE_integer("num_tpu_cores", 8, "Total number of TPU cores to use.")
+
+flags.DEFINE_boolean("do_train", False, "Whether to run training.")
+
+flags.DEFINE_boolean("do_eval", False, "Whether to run eval on the dev set.")
+
+flags.DEFINE_boolean("do_predict", False, "Whether to run predictions on the test set.")

 flags.DEFINE_boolean(
-    "do_train", False,
-    "Whether to run training.")
+    "evaluate_during_training", False, "Whether to run evaluation during training at each logging step."
+)

-flags.DEFINE_boolean(
-    "do_eval", False,
-    "Whether to run eval on the dev set.")
+flags.DEFINE_boolean("do_lower_case", False, "Set this flag if you are using an uncased model.")

-flags.DEFINE_boolean(
-    "do_predict", False,
-    "Whether to run predictions on the test set.")
+flags.DEFINE_integer("per_device_train_batch_size", 8, "Batch size per GPU/CPU/TPU for training.")

-flags.DEFINE_boolean(
-    "evaluate_during_training", False,
-    "Whether to run evaluation during training at each logging step.")
-
-flags.DEFINE_boolean(
-    "do_lower_case", False,
-    "Set this flag if you are using an uncased model.")
+flags.DEFINE_integer("per_device_eval_batch_size", 8, "Batch size per GPU/CPU/TPU for evaluation.")

 flags.DEFINE_integer(
-    "per_device_train_batch_size", 8,
-    "Batch size per GPU/CPU/TPU for training.")
+    "gradient_accumulation_steps", 1, "Number of updates steps to accumulate before performing a backward/update pass."
+)
+
+flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
+
+flags.DEFINE_float("weight_decay", 0.0, "Weight decay if we apply some.")
+
+flags.DEFINE_float("adam_epsilon", 1e-8, "Epsilon for Adam optimizer.")
+
+flags.DEFINE_float("max_grad_norm", 1.0, "Max gradient norm.")
+
+flags.DEFINE_integer("num_train_epochs", 3, "Total number of training epochs to perform.")

 flags.DEFINE_integer(
-    "per_device_eval_batch_size", 8,
-    "Batch size per GPU/CPU/TPU for evaluation.")
+    "max_steps", -1, "If > 0: set total number of training steps to perform. Override num_train_epochs."
+)

-flags.DEFINE_integer(
-    "gradient_accumulation_steps", 1,
-    "Number of updates steps to accumulate before performing a backward/update pass.")
+flags.DEFINE_integer("warmup_steps", 0, "Linear warmup over warmup_steps.")

-flags.DEFINE_float(
-    "learning_rate", 5e-5,
-    "The initial learning rate for Adam.")
+flags.DEFINE_integer("logging_steps", 50, "Log every X updates steps.")

-flags.DEFINE_float(
-    "weight_decay", 0.0,
-    "Weight decay if we apply some.")
-
-flags.DEFINE_float(
-    "adam_epsilon", 1e-8,
-    "Epsilon for Adam optimizer.")
-
-flags.DEFINE_float(
-    "max_grad_norm", 1.0,
-    "Max gradient norm.")
-
-flags.DEFINE_integer(
-    "num_train_epochs", 3,
-    "Total number of training epochs to perform.")
-
-flags.DEFINE_integer(
-    "max_steps", -1,
-    "If > 0: set total number of training steps to perform. Override num_train_epochs.")
-
-flags.DEFINE_integer(
-    "warmup_steps", 0,
-    "Linear warmup over warmup_steps.")
-
-flags.DEFINE_integer(
-    "logging_steps", 50,
-    "Log every X updates steps.")
-
-flags.DEFINE_integer(
-    "save_steps", 50,
-    "Save checkpoint every X updates steps.")
+flags.DEFINE_integer("save_steps", 50, "Save checkpoint every X updates steps.")

 flags.DEFINE_boolean(
-    "eval_all_checkpoints", False,
-    "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+    "eval_all_checkpoints",
+    False,
+    "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+)

-flags.DEFINE_boolean(
-    "no_cuda", False,
-    "Avoid using CUDA when available")
+flags.DEFINE_boolean("no_cuda", False, "Avoid using CUDA when available")

-flags.DEFINE_boolean(
-    "overwrite_output_dir", False,
-    "Overwrite the content of the output directory")
+flags.DEFINE_boolean("overwrite_output_dir", False, "Overwrite the content of the output directory")

-flags.DEFINE_boolean(
-    "overwrite_cache", False,
-    "Overwrite the cached training and evaluation sets")
+flags.DEFINE_boolean("overwrite_cache", False, "Overwrite the cached training and evaluation sets")

-flags.DEFINE_integer(
-    "seed", 42,
-    "random seed for initialization")
+flags.DEFINE_integer("seed", 42, "random seed for initialization")

-flags.DEFINE_boolean(
-    "fp16", False,
-    "Whether to use 16-bit (mixed) precision instead of 32-bit")
+flags.DEFINE_boolean("fp16", False, "Whether to use 16-bit (mixed) precision instead of 32-bit")

 flags.DEFINE_string(
-    "gpus", "0",
+    "gpus",
+    "0",
    "Comma separated list of gpus devices. If only one, switch to single "
-    "gpu strategy, if None takes all the gpus available.")
+    "gpu strategy, if None takes all the gpus available.",
+)


-def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id):
-    if args['max_steps'] > 0:
-        num_train_steps = args['max_steps'] * args['gradient_accumulation_steps']
-        args['num_train_epochs'] = 1
+def train(
+    args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id
+):
+    if args["max_steps"] > 0:
+        num_train_steps = args["max_steps"] * args["gradient_accumulation_steps"]
+        args["num_train_epochs"] = 1
    else:
-        num_train_steps = math.ceil(num_train_examples / train_batch_size) // args['gradient_accumulation_steps'] * args['num_train_epochs']
+        num_train_steps = (
+            math.ceil(num_train_examples / train_batch_size)
+            // args["gradient_accumulation_steps"]
+            * args["num_train_epochs"]
+        )

    writer = tf.summary.create_file_writer("/tmp/mylogs")

    with strategy.scope():
        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
-        optimizer = create_optimizer(args['learning_rate'], num_train_steps, args['warmup_steps'])
+        optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"])

-        if args['fp16']:
-            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
+        if args["fp16"]:
+            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")

-        loss_metric = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)
+        loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
        gradient_accumulator = GradientAccumulator()
-        
+
    logging.info("***** Running training *****")
    logging.info("  Num examples = %d", num_train_examples)
-    logging.info("  Num Epochs = %d", args['num_train_epochs'])
-    logging.info("  Instantaneous batch size per device = %d", args['per_device_train_batch_size'])
-    logging.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                train_batch_size * args['gradient_accumulation_steps'])
-    logging.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
+    logging.info("  Num Epochs = %d", args["num_train_epochs"])
+    logging.info("  Instantaneous batch size per device = %d", args["per_device_train_batch_size"])
+    logging.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        train_batch_size * args["gradient_accumulation_steps"],
+    )
+    logging.info("  Gradient Accumulation steps = %d", args["gradient_accumulation_steps"])
    logging.info("  Total training steps = %d", num_train_steps)

    model.summary()
@@ -214,26 +181,28 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l

        for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables):
            if gradient is not None:
-                scaled_gradient = gradient / (args['n_device'] * args['gradient_accumulation_steps'])
+                scaled_gradient = gradient / (args["n_device"] * args["gradient_accumulation_steps"])
                grads_and_vars.append((scaled_gradient, variable))
            else:
                grads_and_vars.append((gradient, variable))

-        optimizer.apply_gradients(grads_and_vars, args['max_grad_norm'])
+        optimizer.apply_gradients(grads_and_vars, args["max_grad_norm"])
        gradient_accumulator.reset()

    @tf.function
    def train_step(train_features, train_labels):
        def step_fn(train_features, train_labels):
-            inputs = {'attention_mask': train_features['input_mask'], 'training': True}
+            inputs = {"attention_mask": train_features["input_mask"], "training": True}

-            if args['model_type'] != "distilbert":
-                inputs["token_type_ids"] = train_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None
+            if args["model_type"] != "distilbert":
+                inputs["token_type_ids"] = (
+                    train_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
+                )

            with tf.GradientTape() as tape:
-                logits = model(train_features['input_ids'], **inputs)[0]
+                logits = model(train_features["input_ids"], **inputs)[0]
                logits = tf.reshape(logits, (-1, len(labels) + 1))
-                active_loss = tf.reshape(train_features['input_mask'], (-1,))
+                active_loss = tf.reshape(train_features["input_mask"], (-1,))
                active_logits = tf.boolean_mask(logits, active_loss)
                train_labels = tf.reshape(train_labels, (-1,))
                active_labels = tf.boolean_mask(train_labels, active_loss)
@@ -251,34 +220,40 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l
        return mean_loss

    current_time = datetime.datetime.now()
-    train_iterator = master_bar(range(args['num_train_epochs']))
+    train_iterator = master_bar(range(args["num_train_epochs"]))
    global_step = 0
    logging_loss = 0.0

    for epoch in train_iterator:
-        epoch_iterator = progress_bar(train_dataset, total=num_train_steps, parent=train_iterator, display=args['n_device'] > 1)
+        epoch_iterator = progress_bar(
+            train_dataset, total=num_train_steps, parent=train_iterator, display=args["n_device"] > 1
+        )
        step = 1

        with strategy.scope():
            for train_features, train_labels in epoch_iterator:
                loss = train_step(train_features, train_labels)

-                if step % args['gradient_accumulation_steps'] == 0:
+                if step % args["gradient_accumulation_steps"] == 0:
                    strategy.experimental_run_v2(apply_gradients)

                    loss_metric(loss)

                    global_step += 1

-                    if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
+                    if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
                        # Log metrics
-                        if args['n_device'] == 1 and args['evaluate_during_training']:  # Only evaluate when single GPU otherwise metrics may not average well
-                            y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
+                        if (
+                            args["n_device"] == 1 and args["evaluate_during_training"]
+                        ):  # Only evaluate when single GPU otherwise metrics may not average well
+                            y_true, y_pred, eval_loss = evaluate(
+                                args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
+                            )
                            report = metrics.classification_report(y_true, y_pred, digits=4)
-                            
+
                            logging.info("Eval at step " + str(global_step) + "\n" + report)
                            logging.info("eval_loss: " + str(eval_loss))
-                            
+
                            precision = metrics.precision_score(y_true, y_pred)
                            recall = metrics.recall_score(y_true, y_pred)
                            f1 = metrics.f1_score(y_true, y_pred)
@@ -288,33 +263,35 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l
                                tf.summary.scalar("precision", precision, global_step)
                                tf.summary.scalar("recall", recall, global_step)
                                tf.summary.scalar("f1", f1, global_step)
-                        
+
                        lr = optimizer.learning_rate
                        learning_rate = lr(step)

                        with writer.as_default():
                            tf.summary.scalar("lr", learning_rate, global_step)
-                            tf.summary.scalar("loss", (loss_metric.result() - logging_loss) / args['logging_steps'], global_step)
-                        
+                            tf.summary.scalar(
+                                "loss", (loss_metric.result() - logging_loss) / args["logging_steps"], global_step
+                            )
+
                        logging_loss = loss_metric.result()

                    with writer.as_default():
                        tf.summary.scalar("loss", loss_metric.result(), step=step)

-                    if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
+                    if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
                        # Save model checkpoint
-                        output_dir = os.path.join(args['output_dir'], "checkpoint-{}".format(global_step))
+                        output_dir = os.path.join(args["output_dir"], "checkpoint-{}".format(global_step))

                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
-                        
+
                        model.save_pretrained(output_dir)
                        logging.info("Saving model checkpoint to %s", output_dir)
-                
-                train_iterator.child.comment = f'loss : {loss_metric.result()}'
+
+                train_iterator.child.comment = f"loss : {loss_metric.result()}"
                step += 1

-        train_iterator.write(f'loss epoch {epoch + 1}: {loss_metric.result()}')
+        train_iterator.write(f"loss epoch {epoch + 1}: {loss_metric.result()}")

        loss_metric.reset_states()

@@ -322,13 +299,15 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l


 def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode):
-    eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
-    eval_dataset, size = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode)
+    eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"]
+    eval_dataset, size = load_and_cache_examples(
+        args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode
+    )
    eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)
    preds = None
    num_eval_steps = math.ceil(size / eval_batch_size)
    master = master_bar(range(1))
-    eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args['n_device'] > 1)
+    eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args["n_device"] > 1)
    loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    loss = 0.0

@@ -337,15 +316,17 @@ def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode)
    logging.info("  Batch size = %d", eval_batch_size)

    for eval_features, eval_labels in eval_iterator:
-        inputs = {'attention_mask': eval_features['input_mask'], 'training': False}
+        inputs = {"attention_mask": eval_features["input_mask"], "training": False}

-        if args['model_type'] != "distilbert":
-            inputs["token_type_ids"] = eval_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None
+        if args["model_type"] != "distilbert":
+            inputs["token_type_ids"] = (
+                eval_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
+            )

        with strategy.scope():
-            logits = model(eval_features['input_ids'], **inputs)[0]
+            logits = model(eval_features["input_ids"], **inputs)[0]
            tmp_logits = tf.reshape(logits, (-1, len(labels) + 1))
-            active_loss = tf.reshape(eval_features['input_mask'], (-1,))
+            active_loss = tf.reshape(eval_features["input_mask"], (-1,))
            active_logits = tf.boolean_mask(tmp_logits, active_loss)
            tmp_eval_labels = tf.reshape(eval_labels, (-1,))
            active_labels = tf.boolean_mask(tmp_eval_labels, active_loss)
@@ -384,11 +365,11 @@ def load_cache(cached_file, max_seq_length):
    def _decode_record(record):
        example = tf.io.parse_single_example(record, name_to_features)
        features = {}
-        features['input_ids'] = example['input_ids']
-        features['input_mask'] = example['input_mask']
-        features['segment_ids'] = example['segment_ids']
+        features["input_ids"] = example["input_ids"]
+        features["input_mask"] = example["input_mask"]
+        features["segment_ids"] = example["segment_ids"]

-        return features, example['label_ids']
+        return features, example["label_ids"]

    d = tf.data.TFRecordDataset(cached_file)
    d = d.map(_decode_record, num_parallel_calls=4)
@@ -422,39 +403,46 @@ def save_cache(features, cached_features_file):


 def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_size, mode):
-    drop_remainder = True if args['tpu'] or mode == 'train' else False
+    drop_remainder = True if args["tpu"] or mode == "train" else False

    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(args['data_dir'], "cached_{}_{}_{}.tf_record".format(mode,
-        list(filter(None, args['model_name_or_path'].split("/"))).pop(),
-        str(args['max_seq_length'])))
-    if os.path.exists(cached_features_file) and not args['overwrite_cache']:
+    cached_features_file = os.path.join(
+        args["data_dir"],
+        "cached_{}_{}_{}.tf_record".format(
+            mode, list(filter(None, args["model_name_or_path"].split("/"))).pop(), str(args["max_seq_length"])
+        ),
+    )
+    if os.path.exists(cached_features_file) and not args["overwrite_cache"]:
        logging.info("Loading features from cached file %s", cached_features_file)
-        dataset, size = load_cache(cached_features_file, args['max_seq_length'])
+        dataset, size = load_cache(cached_features_file, args["max_seq_length"])
    else:
-        logging.info("Creating features from dataset file at %s", args['data_dir'])
-        examples = read_examples_from_file(args['data_dir'], mode)
-        features = convert_examples_to_features(examples, labels, args['max_seq_length'], tokenizer,
-                                                cls_token_at_end=bool(args['model_type'] in ["xlnet"]),
-                                                # xlnet has a cls token at the end
-                                                cls_token=tokenizer.cls_token,
-                                                cls_token_segment_id=2 if args['model_type'] in ["xlnet"] else 0,
-                                                sep_token=tokenizer.sep_token,
-                                                sep_token_extra=bool(args['model_type'] in ["roberta"]),
-                                                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
-                                                pad_on_left=bool(args['model_type'] in ["xlnet"]),
-                                                # pad on the left for xlnet
-                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-                                                pad_token_segment_id=4 if args['model_type'] in ["xlnet"] else 0,
-                                                pad_token_label_id=pad_token_label_id
-                                                )
+        logging.info("Creating features from dataset file at %s", args["data_dir"])
+        examples = read_examples_from_file(args["data_dir"], mode)
+        features = convert_examples_to_features(
+            examples,
+            labels,
+            args["max_seq_length"],
+            tokenizer,
+            cls_token_at_end=bool(args["model_type"] in ["xlnet"]),
+            # xlnet has a cls token at the end
+            cls_token=tokenizer.cls_token,
+            cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0,
+            sep_token=tokenizer.sep_token,
+            sep_token_extra=bool(args["model_type"] in ["roberta"]),
+            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
+            pad_on_left=bool(args["model_type"] in ["xlnet"]),
+            # pad on the left for xlnet
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
+            pad_token_label_id=pad_token_label_id,
+        )
        logging.info("Saving features into cached file %s", cached_features_file)
        save_cache(features, cached_features_file)
-        dataset, size = load_cache(cached_features_file, args['max_seq_length'])
+        dataset, size = load_cache(cached_features_file, args["max_seq_length"])

-    if mode == 'train':
+    if mode == "train":
        dataset = dataset.repeat()
-        dataset = dataset.shuffle(buffer_size=8192, seed=args['seed'])
+        dataset = dataset.shuffle(buffer_size=8192, seed=args["seed"])

    dataset = dataset.batch(batch_size, drop_remainder)
    dataset = dataset.prefetch(buffer_size=batch_size)
@@ -466,98 +454,134 @@ def main(_):
    logging.set_verbosity(logging.INFO)
    args = flags.FLAGS.flag_values_dict()

-    if os.path.exists(args['output_dir']) and os.listdir(
-            args['output_dir']) and args['do_train'] and not args['overwrite_output_dir']:
+    if (
+        os.path.exists(args["output_dir"])
+        and os.listdir(args["output_dir"])
+        and args["do_train"]
+        and not args["overwrite_output_dir"]
+    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args['output_dir']))
+                args["output_dir"]
+            )
+        )

-    if args['fp16']:
+    if args["fp16"]:
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

-    if args['tpu']:
-        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args['tpu'])
+    if args["tpu"]:
+        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args["tpu"])
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
-        args['n_device'] = args['num_tpu_cores']
-    elif len(args['gpus'].split(',')) > 1:
-        args['n_device'] = len([f"/gpu:{gpu}" for gpu in args['gpus'].split(',')])
-        strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args['gpus'].split(',')])
-    elif args['no_cuda']:
-        args['n_device'] = 1
+        args["n_device"] = args["num_tpu_cores"]
+    elif len(args["gpus"].split(",")) > 1:
+        args["n_device"] = len([f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
+        strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
+    elif args["no_cuda"]:
+        args["n_device"] = 1
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    else:
-        args['n_device'] = len(args['gpus'].split(','))
-        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args['gpus'].split(',')[0])
+        args["n_device"] = len(args["gpus"].split(","))
+        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args["gpus"].split(",")[0])

-    logging.warning("n_device: %s, distributed training: %s, 16-bits training: %s",
-                   args['n_device'], bool(args['n_device'] > 1), args['fp16'])
+    logging.warning(
+        "n_device: %s, distributed training: %s, 16-bits training: %s",
+        args["n_device"],
+        bool(args["n_device"] > 1),
+        args["fp16"],
+    )

-    labels = get_labels(args['labels'])
+    labels = get_labels(args["labels"])
    num_labels = len(labels) + 1
    pad_token_label_id = 0
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
-    config = config_class.from_pretrained(args['config_name'] if args['config_name'] else args['model_name_or_path'],
-                                          num_labels=num_labels,
-                                          cache_dir=args['cache_dir'] if args['cache_dir'] else None)
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args["model_type"]]
+    config = config_class.from_pretrained(
+        args["config_name"] if args["config_name"] else args["model_name_or_path"],
+        num_labels=num_labels,
+        cache_dir=args["cache_dir"] if args["cache_dir"] else None,
+    )

    logging.info("Training/evaluation parameters %s", args)

    # Training
-    if args['do_train']:
-        tokenizer = tokenizer_class.from_pretrained(args['tokenizer_name'] if args['tokenizer_name'] else args['model_name_or_path'],
-                                                    do_lower_case=args['do_lower_case'],
-                                                    cache_dir=args['cache_dir'] if args['cache_dir'] else None)
+    if args["do_train"]:
+        tokenizer = tokenizer_class.from_pretrained(
+            args["tokenizer_name"] if args["tokenizer_name"] else args["model_name_or_path"],
+            do_lower_case=args["do_lower_case"],
+            cache_dir=args["cache_dir"] if args["cache_dir"] else None,
+        )

        with strategy.scope():
-            model = model_class.from_pretrained(args['model_name_or_path'],
-                                                from_pt=bool(".bin" in args['model_name_or_path']),
-                                                config=config,
-                                                cache_dir=args['cache_dir'] if args['cache_dir'] else None)
+            model = model_class.from_pretrained(
+                args["model_name_or_path"],
+                from_pt=bool(".bin" in args["model_name_or_path"]),
+                config=config,
+                cache_dir=args["cache_dir"] if args["cache_dir"] else None,
+            )
            model.layers[-1].activation = tf.keras.activations.softmax

-        train_batch_size = args['per_device_train_batch_size'] * args['n_device']
-        train_dataset, num_train_examples = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train")
+        train_batch_size = args["per_device_train_batch_size"] * args["n_device"]
+        train_dataset, num_train_examples = load_and_cache_examples(
+            args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train"
+        )
        train_dataset = strategy.experimental_distribute_dataset(train_dataset)
-        train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id)
+        train(
+            args,
+            strategy,
+            train_dataset,
+            tokenizer,
+            model,
+            num_train_examples,
+            labels,
+            train_batch_size,
+            pad_token_label_id,
+        )

-        if not os.path.exists(args['output_dir']):
-            os.makedirs(args['output_dir'])
+        if not os.path.exists(args["output_dir"]):
+            os.makedirs(args["output_dir"])

-        logging.info("Saving model to %s", args['output_dir'])
+        logging.info("Saving model to %s", args["output_dir"])

-        model.save_pretrained(args['output_dir'])
-        tokenizer.save_pretrained(args['output_dir'])
+        model.save_pretrained(args["output_dir"])
+        tokenizer.save_pretrained(args["output_dir"])

    # Evaluation
-    if args['do_eval']:
-        tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
+    if args["do_eval"]:
+        tokenizer = tokenizer_class.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"])
        checkpoints = []
        results = []

-        if args['eval_all_checkpoints']:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + "/**/" + TF2_WEIGHTS_NAME, recursive=True), key=lambda f: int(''.join(filter(str.isdigit, f)) or -1)))
-        
+        if args["eval_all_checkpoints"]:
+            checkpoints = list(
+                os.path.dirname(c)
+                for c in sorted(
+                    glob.glob(args["output_dir"] + "/**/" + TF2_WEIGHTS_NAME, recursive=True),
+                    key=lambda f: int("".join(filter(str.isdigit, f)) or -1),
+                )
+            )
+
        logging.info("Evaluate the following checkpoints: %s", checkpoints)

        if len(checkpoints) == 0:
-            checkpoints.append(args['output_dir'])
-        
+            checkpoints.append(args["output_dir"])
+
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final"

            with strategy.scope():
                model = model_class.from_pretrained(checkpoint)

-            y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
+            y_true, y_pred, eval_loss = evaluate(
+                args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
+            )
            report = metrics.classification_report(y_true, y_pred, digits=4)

            if global_step:
                results.append({global_step + "_report": report, global_step + "_loss": eval_loss})

-        output_eval_file = os.path.join(args['output_dir'], "eval_results.txt")
-        
+        output_eval_file = os.path.join(args["output_dir"], "eval_results.txt")
+
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            for res in results:
                for key, val in res.items():
@@ -572,26 +596,28 @@ def main(_):
                        writer.write(report)
                        writer.write("\n")

-    if args['do_predict']:
-        tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
-        model = model_class.from_pretrained(args['output_dir'])
-        eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
-        predict_dataset, _ = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test")
+    if args["do_predict"]:
+        tokenizer = tokenizer_class.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"])
+        model = model_class.from_pretrained(args["output_dir"])
+        eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"]
+        predict_dataset, _ = load_and_cache_examples(
+            args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test"
+        )
        y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test")
-        output_test_results_file = os.path.join(args['output_dir'], "test_results.txt")
-        output_test_predictions_file = os.path.join(args['output_dir'], "test_predictions.txt")
+        output_test_results_file = os.path.join(args["output_dir"], "test_results.txt")
+        output_test_predictions_file = os.path.join(args["output_dir"], "test_predictions.txt")
        report = metrics.classification_report(y_true, y_pred, digits=4)

        with tf.io.gfile.GFile(output_test_results_file, "w") as writer:
            report = metrics.classification_report(y_true, y_pred, digits=4)
-            
+
            logging.info("\n" + report)
-            
+
            writer.write(report)
            writer.write("\n\nloss = " + str(pred_loss))

        with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer:
-            with tf.io.gfile.GFile(os.path.join(args['data_dir'], "test.txt"), "r") as f:
+            with tf.io.gfile.GFile(os.path.join(args["data_dir"], "test.txt"), "r") as f:
                example_id = 0

                for line in f:
--- a/examples/run_xnli.py
+++ b/examples/run_xnli.py
@@ -26,8 +26,7 @@ import random

 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler

 try:
@@ -37,10 +36,18 @@ except:

 from tqdm import tqdm, trange

-from transformers import (WEIGHTS_NAME, 
-                          BertConfig, BertForSequenceClassification, BertTokenizer,
-                          XLMConfig, XLMForSequenceClassification, XLMTokenizer,
-                          DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
+from transformers import (
+    WEIGHTS_NAME,
+    BertConfig,
+    BertForSequenceClassification,
+    BertTokenizer,
+    XLMConfig,
+    XLMForSequenceClassification,
+    XLMTokenizer,
+    DistilBertConfig,
+    DistilBertForSequenceClassification,
+    DistilBertTokenizer,
+)

 from transformers import AdamW, get_linear_schedule_with_warmup

@@ -52,12 +59,14 @@ from transformers import glue_convert_examples_to_features as convert_examples_t

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), ())
+ALL_MODELS = sum(
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), ()
+)

 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
-    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
+    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
+    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
 }


@@ -85,19 +94,26 @@ def train(args, train_dataset, model, tokenizer):
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )

    # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
+    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+        os.path.join(args.model_name_or_path, "scheduler.pt")
+    ):
        # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
@@ -112,17 +128,21 @@ def train(args, train_dataset, model, tokenizer):

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

@@ -132,7 +152,7 @@ def train(args, train_dataset, model, tokenizer):
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to gobal_step of last saved checkpoint from model path
-        global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
+        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

@@ -143,7 +163,9 @@ def train(args, train_dataset, model, tokenizer):

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
-    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+    )
    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
@@ -155,16 +177,16 @@ def train(args, train_dataset, model, tokenizer):

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      'labels':         batch[3]}
-            if args.model_type != 'distilbert':
-                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None  # XLM and DistilBERT don't use segment_ids
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = (
+                    batch[2] if args.model_type in ["bert"] else None
+                )  # XLM and DistilBERT don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

@@ -188,28 +210,32 @@ def train(args, train_dataset, model, tokenizer):

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
@@ -258,11 +284,11 @@ def evaluate(args, model, tokenizer, prefix=""):
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
-                inputs = {'input_ids':      batch[0],
-                          'attention_mask': batch[1],
-                          'labels':         batch[3]}
-                if args.model_type != 'distilbert':
-                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None  # XLM and DistilBERT don't use segment_ids
+                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+                if args.model_type != "distilbert":
+                    inputs["token_type_ids"] = (
+                        batch[2] if args.model_type in ["bert"] else None
+                    )  # XLM and DistilBERT don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

@@ -270,16 +296,16 @@ def evaluate(args, model, tokenizer, prefix=""):
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
-                out_label_ids = inputs['labels'].detach().cpu().numpy()
+                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        else:
-            raise ValueError('No other `output_mode` for XNLI.')
+            raise ValueError("No other `output_mode` for XNLI.")
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

@@ -300,27 +326,34 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task](language=args.language, train_language=args.train_language)
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}_{}'.format(
-        'test' if evaluate else 'train',
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length),
-        str(task),
-        str(args.train_language if (not evaluate and args.train_language is not None) else args.language)))
+    cached_features_file = os.path.join(
+        args.data_dir,
+        "cached_{}_{}_{}_{}_{}".format(
+            "test" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+            str(task),
+            str(args.train_language if (not evaluate and args.train_language is not None) else args.language),
+        ),
+    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
-        examples = processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-        features = convert_examples_to_features(examples,
-                                                tokenizer,
-                                                label_list=label_list,
-                                                max_length=args.max_seq_length,
-                                                output_mode=output_mode,
-                                                pad_on_left=False,
-                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-                                                pad_token_segment_id=0,
+        examples = (
+            processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+        )
+        features = convert_examples_to_features(
+            examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
+            pad_on_left=False,
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -336,7 +369,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    else:
-        raise ValueError('No other `output_mode` for XNLI.')
+        raise ValueError("No other `output_mode` for XNLI.")

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset
@@ -346,92 +379,152 @@ def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--language", default=None, type=str, required=True,
-                        help="Evaluation language. Also train language if `train_language` is set to None.")
-    parser.add_argument("--train_language", default=None, type=str,
-                        help="Train language if is different of the evaluation language.")
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--language",
+        default=None,
+        type=str,
+        required=True,
+        help="Evaluation language. Also train language if `train_language` is set to None.",
+    )
+    parser.add_argument(
+        "--train_language", default=None, type=str, help="Train language if is different of the evaluation language."
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )

    ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences longer "
-                             "than this will be truncated, sequences shorter will be padded.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the test set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the test set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )

-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Avoid using CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -443,22 +536,30 @@ def main():
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )

    # Set seed
    set_seed(args)

    # Prepare XNLI task
-    args.task_name = 'xnli'
+    args.task_name = "xnli"
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name](language=args.language, train_language=args.train_language)
@@ -472,17 +573,23 @@ def main():

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          num_labels=num_labels,
-                                          finetuning_task=args.task_name,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=args.task_name,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -491,14 +598,12 @@ def main():

    logger.info("Training/evaluation parameters %s", args)

-
    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

-
    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
@@ -508,36 +613,39 @@ def main():
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

-
    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-            
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
--- a/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
+++ b/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
@@ -34,12 +34,30 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)


-SAMPLE_TEXT = 'Hello world! cécé herlolip'
+SAMPLE_TEXT = "Hello world! cécé herlolip"


 BertAbsConfig = namedtuple(
    "BertAbsConfig",
-    ["temp_dir", "large", "use_bert_emb", "finetune_bert", "encoder", "share_emb", "max_pos", "enc_layers", "enc_hidden_size", "enc_heads", "enc_ff_size", "enc_dropout", "dec_layers", "dec_hidden_size", "dec_heads", "dec_ff_size", "dec_dropout"],
+    [
+        "temp_dir",
+        "large",
+        "use_bert_emb",
+        "finetune_bert",
+        "encoder",
+        "share_emb",
+        "max_pos",
+        "enc_layers",
+        "enc_hidden_size",
+        "enc_heads",
+        "enc_ff_size",
+        "enc_dropout",
+        "dec_layers",
+        "dec_hidden_size",
+        "dec_heads",
+        "dec_ff_size",
+        "dec_dropout",
+    ],
 )


@@ -119,7 +137,9 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
    output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0]
    output_original_generator = original.generator(output_original_model)

-    output_converted_model = new_model(encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask)[0]
+    output_converted_model = new_model(
+        encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask
+    )[0]
    output_converted_generator = new_model.generator(output_converted_model)

    maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item()
@@ -136,28 +156,21 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
    # The model has been saved with torch.save(model) and this is bound to the exact
    # directory structure. We save the state_dict instead.
    logging.info("saving the model's state dictionary")
-    torch.save(new_model.state_dict(), "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin")
+    torch.save(
+        new_model.state_dict(), "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin"
+    )


 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
-        "--bertabs_checkpoint_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path the official PyTorch dump.",
+        "--bertabs_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump.",
    )
    parser.add_argument(
-        "--pytorch_dump_folder_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to the output PyTorch model.",
+        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model.",
    )
    args = parser.parse_args()

    convert_bertabs_checkpoints(
-        args.bertabs_checkpoint_path,
-        args.pytorch_dump_folder_path,
+        args.bertabs_checkpoint_path, args.pytorch_dump_folder_path,
    )
--- a/examples/summarization/modeling_bertabs.py
+++ b/examples/summarization/modeling_bertabs.py
@@ -56,40 +56,22 @@ class BertAbs(BertAbsPreTrainedModel):
        load_bert_pretrained_extractive = True if bert_extractive_checkpoint else False
        if load_bert_pretrained_extractive:
            self.bert.model.load_state_dict(
-                dict(
-                    [
-                        (n[11:], p)
-                        for n, p in bert_extractive_checkpoint.items()
-                        if n.startswith("bert.model")
-                    ]
-                ),
+                dict([(n[11:], p) for n, p in bert_extractive_checkpoint.items() if n.startswith("bert.model")]),
                strict=True,
            )

        self.vocab_size = self.bert.model.config.vocab_size

        if args.max_pos > 512:
-            my_pos_embeddings = nn.Embedding(
-                args.max_pos, self.bert.model.config.hidden_size
-            )
-            my_pos_embeddings.weight.data[
-                :512
-            ] = self.bert.model.embeddings.position_embeddings.weight.data
-            my_pos_embeddings.weight.data[
-                512:
-            ] = self.bert.model.embeddings.position_embeddings.weight.data[-1][
+            my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size)
+            my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
+            my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][
                None, :
-            ].repeat(
-                args.max_pos - 512, 1
-            )
+            ].repeat(args.max_pos - 512, 1)
            self.bert.model.embeddings.position_embeddings = my_pos_embeddings
-        tgt_embeddings = nn.Embedding(
-            self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0
-        )
+        tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0)

-        tgt_embeddings.weight = copy.deepcopy(
-            self.bert.model.embeddings.word_embeddings.weight
-        )
+        tgt_embeddings.weight = copy.deepcopy(self.bert.model.embeddings.word_embeddings.weight)

        self.decoder = TransformerDecoder(
            self.args.dec_layers,
@@ -102,9 +84,7 @@ class BertAbs(BertAbsPreTrainedModel):
        )

        gen_func = nn.LogSoftmax(dim=-1)
-        self.generator = nn.Sequential(
-            nn.Linear(args.dec_hidden_size, args.vocab_size), gen_func
-        )
+        self.generator = nn.Sequential(nn.Linear(args.dec_hidden_size, args.vocab_size), gen_func)
        self.generator[0].weight = self.decoder.embeddings.weight

        load_from_checkpoints = False if checkpoint is None else True
@@ -127,25 +107,14 @@ class BertAbs(BertAbsPreTrainedModel):
                p.data.zero_()

    def forward(
-        self,
-        encoder_input_ids,
-        decoder_input_ids,
-        token_type_ids,
-        encoder_attention_mask,
-        decoder_attention_mask,
+        self, encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask,
    ):
        encoder_output = self.bert(
-            input_ids=encoder_input_ids,
-            token_type_ids=token_type_ids,
-            attention_mask=encoder_attention_mask,
+            input_ids=encoder_input_ids, token_type_ids=token_type_ids, attention_mask=encoder_attention_mask,
        )
        encoder_hidden_states = encoder_output[0]
-        dec_state = self.decoder.init_decoder_state(
-            encoder_input_ids, encoder_hidden_states
-        )
-        decoder_outputs, _ = self.decoder(
-            decoder_input_ids[:, :-1], encoder_hidden_states, dec_state
-        )
+        dec_state = self.decoder.init_decoder_state(encoder_input_ids, encoder_hidden_states)
+        decoder_outputs, _ = self.decoder(decoder_input_ids[:, :-1], encoder_hidden_states, dec_state)
        return decoder_outputs


@@ -162,10 +131,7 @@ class Bert(nn.Module):
        self.eval()
        with torch.no_grad():
            encoder_outputs, _ = self.model(
-                input_ids,
-                token_type_ids=token_type_ids,
-                attention_mask=attention_mask,
-                **kwargs
+                input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, **kwargs
            )
        return encoder_outputs

@@ -196,10 +162,7 @@ class TransformerDecoder(nn.Module):

        # Build TransformerDecoder.
        self.transformer_layers = nn.ModuleList(
-            [
-                TransformerDecoderLayer(d_model, heads, d_ff, dropout)
-                for _ in range(num_layers)
-            ]
+            [TransformerDecoderLayer(d_model, heads, d_ff, dropout) for _ in range(num_layers)]
        )

        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
@@ -236,20 +199,14 @@ class TransformerDecoder(nn.Module):
        # Decoder padding mask
        tgt_words = tgt
        tgt_batch, tgt_len = tgt_words.size()
-        tgt_pad_mask = (
-            tgt_words.data.eq(padding_idx).unsqueeze(1).expand(tgt_batch, tgt_len, tgt_len)
-        )
+        tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1).expand(tgt_batch, tgt_len, tgt_len)

        # Encoder padding mask
        if memory_mask is not None:
            src_len = memory_mask.size(-1)
            src_pad_mask = memory_mask.expand(src_batch, tgt_len, src_len)
        else:
-            src_pad_mask = (
-                src_words.data.eq(padding_idx)
-                .unsqueeze(1)
-                .expand(src_batch, tgt_len, src_len)
-            )
+            src_pad_mask = src_words.data.eq(padding_idx).unsqueeze(1).expand(src_batch, tgt_len, src_len)

        # Pass through the embeddings
        emb = self.embeddings(input_ids)
@@ -271,9 +228,7 @@ class TransformerDecoder(nn.Module):
                src_pad_mask,
                tgt_pad_mask,
                previous_input=prev_layer_input,
-                layer_cache=state.cache["layer_{}".format(i)]
-                if state.cache is not None
-                else None,
+                layer_cache=state.cache["layer_{}".format(i)] if state.cache is not None else None,
                step=step,
            )
            if state.cache is None:
@@ -303,9 +258,7 @@ class PositionalEncoding(nn.Module):
    def __init__(self, dropout, dim, max_len=5000):
        pe = torch.zeros(max_len, dim)
        position = torch.arange(0, max_len).unsqueeze(1)
-        div_term = torch.exp(
-            (torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim))
-        )
+        div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim)))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        pe = pe.unsqueeze(0)
@@ -356,14 +309,7 @@ class TransformerDecoderLayer(nn.Module):
        self.register_buffer("mask", mask)

    def forward(
-        self,
-        inputs,
-        memory_bank,
-        src_pad_mask,
-        tgt_pad_mask,
-        previous_input=None,
-        layer_cache=None,
-        step=None,
+        self, inputs, memory_bank, src_pad_mask, tgt_pad_mask, previous_input=None, layer_cache=None, step=None,
    ):
        """
        Args:
@@ -380,34 +326,20 @@ class TransformerDecoderLayer(nn.Module):
            * all_input `[batch_size x current_step x model_dim]`

        """
-        dec_mask = torch.gt(
-            tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)], 0
-        )
+        dec_mask = torch.gt(tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)], 0)
        input_norm = self.layer_norm_1(inputs)
        all_input = input_norm
        if previous_input is not None:
            all_input = torch.cat((previous_input, input_norm), dim=1)
            dec_mask = None

-        query = self.self_attn(
-            all_input,
-            all_input,
-            input_norm,
-            mask=dec_mask,
-            layer_cache=layer_cache,
-            type="self",
-        )
+        query = self.self_attn(all_input, all_input, input_norm, mask=dec_mask, layer_cache=layer_cache, type="self",)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid = self.context_attn(
-            memory_bank,
-            memory_bank,
-            query_norm,
-            mask=src_pad_mask,
-            layer_cache=layer_cache,
-            type="context",
+            memory_bank, memory_bank, query_norm, mask=src_pad_mask, layer_cache=layer_cache, type="context",
        )
        output = self.feed_forward(self.drop(mid) + query)

@@ -492,14 +424,7 @@ class MultiHeadedAttention(nn.Module):
            self.final_linear = nn.Linear(model_dim, model_dim)

    def forward(
-        self,
-        key,
-        value,
-        query,
-        mask=None,
-        layer_cache=None,
-        type=None,
-        predefined_graph_1=None,
+        self, key, value, query, mask=None, layer_cache=None, type=None, predefined_graph_1=None,
    ):
        """
        Compute the context vector and the attention vectors.
@@ -531,11 +456,7 @@ class MultiHeadedAttention(nn.Module):

        def unshape(x):
            """  compute context """
-            return (
-                x.transpose(1, 2)
-                .contiguous()
-                .view(batch_size, -1, head_count * dim_per_head)
-            )
+            return x.transpose(1, 2).contiguous().view(batch_size, -1, head_count * dim_per_head)

        # 1) Project key, value, and query.
        if layer_cache is not None:
@@ -554,9 +475,7 @@ class MultiHeadedAttention(nn.Module):
                    if layer_cache["self_keys"] is not None:
                        key = torch.cat((layer_cache["self_keys"].to(device), key), dim=2)
                    if layer_cache["self_values"] is not None:
-                        value = torch.cat(
-                            (layer_cache["self_values"].to(device), value), dim=2
-                        )
+                        value = torch.cat((layer_cache["self_values"].to(device), value), dim=2)
                    layer_cache["self_keys"] = key
                    layer_cache["self_values"] = value
            elif type == "context":
@@ -637,13 +556,9 @@ class DecoderState(object):
            sizes = e.size()
            br = sizes[1]
            if len(sizes) == 3:
-                sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2])[
-                    :, :, idx
-                ]
+                sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2])[:, :, idx]
            else:
-                sent_states = e.view(
-                    sizes[0], beam_size, br // beam_size, sizes[2], sizes[3]
-                )[:, :, idx]
+                sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2], sizes[3])[:, :, idx]

            sent_states.data.copy_(sent_states.data.index_select(1, positions))

@@ -716,11 +631,7 @@ class TransformerDecoderState(DecoderState):


 def gelu(x):
-    return (
-        0.5
-        * x
-        * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-    )
+    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


 class PositionwiseFeedForward(nn.Module):
@@ -758,9 +669,7 @@ class PositionwiseFeedForward(nn.Module):
 def build_predictor(args, tokenizer, symbols, model, logger=None):
    # we should be able to refactor the global scorer a lot
    scorer = GNMTGlobalScorer(args.alpha, length_penalty="wu")
-    translator = Translator(
-        args, model, tokenizer, symbols, global_scorer=scorer, logger=logger
-    )
+    translator = Translator(args, model, tokenizer, symbols, global_scorer=scorer, logger=logger)
    return translator


@@ -891,9 +800,7 @@ class Translator(object):
           Shouldn't need the original dataset.
        """
        with torch.no_grad():
-            return self._fast_translate_batch(
-                batch, self.max_length, min_length=self.min_length
-            )
+            return self._fast_translate_batch(batch, self.max_length, min_length=self.min_length)

    # Where the beam search lives
    # I have no idea why it is being called from the method above
@@ -912,26 +819,18 @@ class Translator(object):
        mask_src = batch.mask_src

        src_features = self.model.bert(src, segs, mask_src)
-        dec_states = self.model.decoder.init_decoder_state(
-            src, src_features, with_cache=True
-        )
+        dec_states = self.model.decoder.init_decoder_state(src, src_features, with_cache=True)
        device = src_features.device

        # Tile states and memory beam_size times.
        dec_states.map_batch_fn(lambda state, dim: tile(state, beam_size, dim=dim))
        src_features = tile(src_features, beam_size, dim=0)
        batch_offset = torch.arange(batch_size, dtype=torch.long, device=device)
-        beam_offset = torch.arange(
-            0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device
-        )
-        alive_seq = torch.full(
-            [batch_size * beam_size, 1], self.start_token, dtype=torch.long, device=device
-        )
+        beam_offset = torch.arange(0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device)
+        alive_seq = torch.full([batch_size * beam_size, 1], self.start_token, dtype=torch.long, device=device)

        # Give full probability to the first beam on the first step.
-        topk_log_probs = torch.tensor(
-            [0.0] + [float("-inf")] * (beam_size - 1), device=device
-        ).repeat(batch_size)
+        topk_log_probs = torch.tensor([0.0] + [float("-inf")] * (beam_size - 1), device=device).repeat(batch_size)

        # Structure that holds finished hypotheses.
        hypotheses = [[] for _ in range(batch_size)]  # noqa: F812
@@ -948,9 +847,7 @@ class Translator(object):
            # Decoder forward.
            decoder_input = decoder_input.transpose(0, 1)

-            dec_out, dec_states = self.model.decoder(
-                decoder_input, src_features, dec_states, step=step
-            )
+            dec_out, dec_states = self.model.decoder(decoder_input, src_features, dec_states, step=step)

            # Generator forward.
            log_probs = self.generator.forward(dec_out.transpose(0, 1).squeeze(0))
@@ -978,10 +875,7 @@ class Translator(object):
                        words = " ".join(words).replace(" ##", "").split()
                        if len(words) <= 3:
                            continue
-                        trigrams = [
-                            (words[i - 1], words[i], words[i + 1])
-                            for i in range(1, len(words) - 1)
-                        ]
+                        trigrams = [(words[i - 1], words[i], words[i + 1]) for i in range(1, len(words) - 1)]
                        trigram = tuple(trigrams[-1])
                        if trigram in trigrams[:-1]:
                            fail = True
@@ -999,15 +893,11 @@ class Translator(object):
            topk_ids = topk_ids.fmod(vocab_size)

            # Map beam_index to batch_index in the flat representation.
-            batch_index = topk_beam_index + beam_offset[
-                : topk_beam_index.size(0)
-            ].unsqueeze(1)
+            batch_index = topk_beam_index + beam_offset[: topk_beam_index.size(0)].unsqueeze(1)
            select_indices = batch_index.view(-1)

            # Append last prediction.
-            alive_seq = torch.cat(
-                [alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1
-            )
+            alive_seq = torch.cat([alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1)

            is_finished = topk_ids.eq(self.end_token)
            if step + 1 == max_length:
@@ -1040,15 +930,11 @@ class Translator(object):
                topk_log_probs = topk_log_probs.index_select(0, non_finished)
                batch_index = batch_index.index_select(0, non_finished)
                batch_offset = batch_offset.index_select(0, non_finished)
-                alive_seq = predictions.index_select(0, non_finished).view(
-                    -1, alive_seq.size(-1)
-                )
+                alive_seq = predictions.index_select(0, non_finished).view(-1, alive_seq.size(-1))
            # Reorder states.
            select_indices = batch_index.view(-1)
            src_features = src_features.index_select(0, select_indices)
-            dec_states.map_batch_fn(
-                lambda state, dim: state.index_select(dim, select_indices)
-            )
+            dec_states.map_batch_fn(lambda state, dim: state.index_select(dim, select_indices))

        return results

@@ -1089,14 +975,7 @@ def tile(x, count, dim=0):
    out_size = list(x.size())
    out_size[0] *= count
    batch = x.size(0)
-    x = (
-        x.view(batch, -1)
-        .transpose(0, 1)
-        .repeat(count, 1)
-        .transpose(0, 1)
-        .contiguous()
-        .view(*out_size)
-    )
+    x = x.view(batch, -1).transpose(0, 1).repeat(count, 1).transpose(0, 1).contiguous().view(*out_size)
    if dim != 0:
        x = x.permute(perm).contiguous()
    return x
@@ -1107,6 +986,7 @@ def tile(x, count, dim=0):
 # a finetuning script.
 #

+
 class BertSumOptimizer(object):
    """ Specific optimizer for BertSum.

@@ -1126,16 +1006,10 @@ class BertSumOptimizer(object):

        self.optimizers = {
            "encoder": torch.optim.Adam(
-                model.encoder.parameters(),
-                lr=lr["encoder"],
-                betas=(beta_1, beta_2),
-                eps=eps,
+                model.encoder.parameters(), lr=lr["encoder"], betas=(beta_1, beta_2), eps=eps,
            ),
            "decoder": torch.optim.Adam(
-                model.decoder.parameters(),
-                lr=lr["decoder"],
-                betas=(beta_1, beta_2),
-                eps=eps,
+                model.decoder.parameters(), lr=lr["decoder"], betas=(beta_1, beta_2), eps=eps,
            ),
        }

@@ -1143,9 +1017,7 @@ class BertSumOptimizer(object):
        self.current_learning_rates = {}

    def _update_rate(self, stack):
-        return self.lr[stack] * min(
-            self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-1.5)
-        )
+        return self.lr[stack] * min(self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-1.5))

    def zero_grad(self):
        self.optimizer_decoder.zero_grad()
--- a/examples/summarization/run_summarization.py
+++ b/examples/summarization/run_summarization.py
@@ -25,9 +25,7 @@ logger = logging.getLogger(__name__)
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)


-Batch = namedtuple(
-    "Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"]
-)
+Batch = namedtuple("Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"])


 def evaluate(args):
@@ -48,13 +46,14 @@ def evaluate(args):

        import rouge
        import nltk
-        nltk.download('punkt')
+
+        nltk.download("punkt")
        rouge_evaluator = rouge.Rouge(
-            metrics=['rouge-n', 'rouge-l'],
+            metrics=["rouge-n", "rouge-l"],
            max_n=2,
            limit_length=True,
            length_limit=args.beam_size,
-            length_limit_type='words',
+            length_limit_type="words",
            apply_avg=True,
            apply_best=False,
            alpha=0.5,  # Default F1_score
@@ -161,15 +160,15 @@ Recall    >> {:.3f}
 F1        >> {:.3f}
 Precision >> {:.3f}
 Recall    >> {:.3f}""".format(
-        scores['rouge-1']['f'],
-        scores['rouge-1']['p'],
-        scores['rouge-1']['r'],
-        scores['rouge-2']['f'],
-        scores['rouge-2']['p'],
-        scores['rouge-2']['r'],
-        scores['rouge-l']['f'],
-        scores['rouge-l']['p'],
-        scores['rouge-l']['r'],
+        scores["rouge-1"]["f"],
+        scores["rouge-1"]["p"],
+        scores["rouge-1"]["r"],
+        scores["rouge-2"]["f"],
+        scores["rouge-2"]["p"],
+        scores["rouge-2"]["r"],
+        scores["rouge-l"]["f"],
+        scores["rouge-l"]["p"],
+        scores["rouge-l"]["r"],
    )


@@ -187,9 +186,7 @@ def build_data_iterator(args, tokenizer):
    dataset = load_and_cache_examples(args, tokenizer)
    sampler = SequentialSampler(dataset)
    collate_fn = lambda data: collate(data, tokenizer, block_size=512, device=args.device)
-    iterator = DataLoader(
-        dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,
-    )
+    iterator = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,)

    return iterator

@@ -210,14 +207,9 @@ def collate(data, tokenizer, block_size, device):
    names = [name for name, _, _ in data]
    summaries = [" ".join(summary_list) for _, _, summary_list in data]

-    encoded_text = [
-        encode_for_summarization(story, summary, tokenizer) for _, story, summary in data
-    ]
+    encoded_text = [encode_for_summarization(story, summary, tokenizer) for _, story, summary in data]
    encoded_stories = torch.tensor(
-        [
-            fit_to_block_size(story, block_size, tokenizer.pad_token_id)
-            for story, _ in encoded_text
-        ]
+        [fit_to_block_size(story, block_size, tokenizer.pad_token_id) for story, _ in encoded_text]
    )
    encoder_token_type_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id)
    encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id)
@@ -272,38 +264,23 @@ def main():
    )
    # EVALUATION options
    parser.add_argument(
-        "--no_cuda",
-        default=False,
-        type=bool,
-        help="Whether to force the execution on CPU.",
+        "--no_cuda", default=False, type=bool, help="Whether to force the execution on CPU.",
    )
    parser.add_argument(
        "--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
    )
    # BEAM SEARCH arguments
    parser.add_argument(
-        "--min_length",
-        default=50,
-        type=int,
-        help="Minimum number of tokens for the summaries.",
+        "--min_length", default=50, type=int, help="Minimum number of tokens for the summaries.",
    )
    parser.add_argument(
-        "--max_length",
-        default=200,
-        type=int,
-        help="Maixmum number of tokens for the summaries.",
+        "--max_length", default=200, type=int, help="Maixmum number of tokens for the summaries.",
    )
    parser.add_argument(
-        "--beam_size",
-        default=5,
-        type=int,
-        help="The number of beams to start with for each example.",
+        "--beam_size", default=5, type=int, help="The number of beams to start with for each example.",
    )
    parser.add_argument(
-        "--alpha",
-        default=0.95,
-        type=float,
-        help="The value of alpha for the length penalty in the beam search.",
+        "--alpha", default=0.95, type=float, help="The value of alpha for the length penalty in the beam search.",
    )
    parser.add_argument(
        "--block_trigram",
--- a/examples/summarization/utils_summarization.py
+++ b/examples/summarization/utils_summarization.py
@@ -68,9 +68,7 @@ def process_story(raw_story):
    Raises:
        IndexError: If the stoy is empty or contains no highlights.
    """
-    nonempty_lines = list(
-        filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")])
-    )
+    nonempty_lines = list(filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")]))

    # for some unknown reason some lines miss a period, add it
    nonempty_lines = [_add_missing_period(line) for line in nonempty_lines]
@@ -135,13 +133,9 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer):
    sentences.
    """
    story_lines_token_ids = [tokenizer.encode(line) for line in story_lines]
-    story_token_ids = [
-        token for sentence in story_lines_token_ids for token in sentence
-    ]
+    story_token_ids = [token for sentence in story_lines_token_ids for token in sentence]
    summary_lines_token_ids = [tokenizer.encode(line) for line in summary_lines]
-    summary_token_ids = [
-        token for sentence in summary_lines_token_ids for token in sentence
-    ]
+    summary_token_ids = [token for sentence in summary_lines_token_ids for token in sentence]

    return story_token_ids, summary_token_ids

--- a/examples/summarization/utils_summarization_test.py
+++ b/examples/summarization/utils_summarization_test.py
@@ -33,25 +33,19 @@ class SummarizationDataProcessingTest(unittest.TestCase):
        """ Pad the sequence with 0 if the sequence is smaller than the block size."""
        sequence = [1, 2, 3, 4]
        expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
-        self.assertEqual(
-            fit_to_block_size(sequence, self.block_size, 0), expected_output
-        )
+        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)

    def test_fit_to_block_sequence_fit_exactly(self):
        """ Do nothing if the sequence is the right size. """
        sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        self.assertEqual(
-            fit_to_block_size(sequence, self.block_size, 0), expected_output
-        )
+        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)

    def test_fit_to_block_sequence_too_big(self):
        """ Truncate the sequence if it is too long. """
        sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
        expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        self.assertEqual(
-            fit_to_block_size(sequence, self.block_size, 0), expected_output
-        )
+        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)

    def test_process_story_no_highlights(self):
        """ Processing a story with no highlights returns an empty list for the summary.
@@ -95,9 +89,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
    def test_build_mask(self):
        sequence = torch.tensor([1, 2, 3, 4, 23, 23, 23])
        expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
-        np.testing.assert_array_equal(
-            build_mask(sequence, 23).numpy(), expected.numpy()
-        )
+        np.testing.assert_array_equal(build_mask(sequence, 23).numpy(), expected.numpy())

    def test_build_mask_with_padding_equal_to_one(self):
        sequence = torch.tensor([8, 2, 3, 4, 1, 1, 1])
@@ -106,12 +98,8 @@ class SummarizationDataProcessingTest(unittest.TestCase):

    def test_compute_token_type_ids(self):
        separator = 101
-        batch = torch.tensor(
-            [[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]]
-        )
-        expected = torch.tensor(
-            [[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]]
-        )
+        batch = torch.tensor([[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]])
+        expected = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]])

        result = compute_token_type_ids(batch, separator)
        np.testing.assert_array_equal(result, expected)
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -35,34 +35,36 @@ logging.basicConfig(level=logging.DEBUG)

 logger = logging.getLogger()

+
 def get_setup_file():
    parser = argparse.ArgumentParser()
-    parser.add_argument('-f')
+    parser.add_argument("-f")
    args = parser.parse_args()
    return args.f

-class ExamplesTests(unittest.TestCase):

+class ExamplesTests(unittest.TestCase):
    def test_run_glue(self):
        stream_handler = logging.StreamHandler(sys.stdout)
        logger.addHandler(stream_handler)

-        testargs = ["run_glue.py",
-                    "--data_dir=./examples/tests_samples/MRPC/",
-                    "--task_name=mrpc",
-                    "--do_train",
-                    "--do_eval",
-                    "--output_dir=./examples/tests_samples/temp_dir",
-                    "--per_gpu_train_batch_size=2",
-                    "--per_gpu_eval_batch_size=1",
-                    "--learning_rate=1e-4",
-                    "--max_steps=10",
-                    "--warmup_steps=2",
-                    "--overwrite_output_dir",
-                    "--seed=42"]
-        model_type, model_name = ("--model_type=bert",
-                                  "--model_name_or_path=bert-base-uncased")
-        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+        testargs = [
+            "run_glue.py",
+            "--data_dir=./examples/tests_samples/MRPC/",
+            "--task_name=mrpc",
+            "--do_train",
+            "--do_eval",
+            "--output_dir=./examples/tests_samples/temp_dir",
+            "--per_gpu_train_batch_size=2",
+            "--per_gpu_eval_batch_size=1",
+            "--learning_rate=1e-4",
+            "--max_steps=10",
+            "--warmup_steps=2",
+            "--overwrite_output_dir",
+            "--seed=42",
+        ]
+        model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased")
+        with patch.object(sys, "argv", testargs + [model_type, model_name]):
            result = run_glue.main()
            for value in result.values():
                self.assertGreaterEqual(value, 0.75)
@@ -71,40 +73,38 @@ class ExamplesTests(unittest.TestCase):
        stream_handler = logging.StreamHandler(sys.stdout)
        logger.addHandler(stream_handler)

-        testargs = ["run_squad.py",
-                    "--data_dir=./examples/tests_samples/SQUAD",
-                    "--model_name=bert-base-uncased",
-                    "--output_dir=./examples/tests_samples/temp_dir",
-                    "--max_steps=10",
-                    "--warmup_steps=2",
-                    "--do_train",
-                    "--do_eval",
-                    "--version_2_with_negative",
-                    "--learning_rate=2e-4",
-                    "--per_gpu_train_batch_size=2",
-                    "--per_gpu_eval_batch_size=1",
-                    "--overwrite_output_dir",
-                    "--seed=42"]
-        model_type, model_name = ("--model_type=bert",
-                                  "--model_name_or_path=bert-base-uncased")
-        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+        testargs = [
+            "run_squad.py",
+            "--data_dir=./examples/tests_samples/SQUAD",
+            "--model_name=bert-base-uncased",
+            "--output_dir=./examples/tests_samples/temp_dir",
+            "--max_steps=10",
+            "--warmup_steps=2",
+            "--do_train",
+            "--do_eval",
+            "--version_2_with_negative",
+            "--learning_rate=2e-4",
+            "--per_gpu_train_batch_size=2",
+            "--per_gpu_eval_batch_size=1",
+            "--overwrite_output_dir",
+            "--seed=42",
+        ]
+        model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased")
+        with patch.object(sys, "argv", testargs + [model_type, model_name]):
            result = run_squad.main()
-            self.assertGreaterEqual(result['f1'], 30)
-            self.assertGreaterEqual(result['exact'], 30)
+            self.assertGreaterEqual(result["f1"], 30)
+            self.assertGreaterEqual(result["exact"], 30)

    def test_generation(self):
        stream_handler = logging.StreamHandler(sys.stdout)
        logger.addHandler(stream_handler)

-        testargs = ["run_generation.py",
-                    "--prompt=Hello",
-                    "--length=10",
-                    "--seed=42"]
-        model_type, model_name = ("--model_type=openai-gpt",
-                                  "--model_name_or_path=openai-gpt")
-        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+        testargs = ["run_generation.py", "--prompt=Hello", "--length=10", "--seed=42"]
+        model_type, model_name = ("--model_type=openai-gpt", "--model_name_or_path=openai-gpt")
+        with patch.object(sys, "argv", testargs + [model_type, model_name]):
            result = run_generation.main()
            self.assertGreaterEqual(len(result), 10)

+
 if __name__ == "__main__":
    unittest.main()
--- a/examples/utils_multiple_choice.py
+++ b/examples/utils_multiple_choice.py
@@ -55,19 +55,10 @@ class InputExample(object):


 class InputFeatures(object):
-    def __init__(self,
-                 example_id,
-                 choices_features,
-                 label
-
-    ):
+    def __init__(self, example_id, choices_features, label):
        self.example_id = example_id
        self.choices_features = [
-            {
-                'input_ids': input_ids,
-                'input_mask': input_mask,
-                'segment_ids': segment_ids
-            }
+            {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
            for input_ids, input_mask, segment_ids in choices_features
        ]
        self.label = label
@@ -99,29 +90,29 @@ class RaceProcessor(DataProcessor):
    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
-        high = os.path.join(data_dir, 'train/high')
-        middle = os.path.join(data_dir, 'train/middle')
+        high = os.path.join(data_dir, "train/high")
+        middle = os.path.join(data_dir, "train/middle")
        high = self._read_txt(high)
        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, 'train')
+        return self._create_examples(high + middle, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
-        high = os.path.join(data_dir, 'dev/high')
-        middle = os.path.join(data_dir, 'dev/middle')
+        high = os.path.join(data_dir, "dev/high")
+        middle = os.path.join(data_dir, "dev/middle")
        high = self._read_txt(high)
        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, 'dev')
+        return self._create_examples(high + middle, "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} test".format(data_dir))
-        high = os.path.join(data_dir, 'test/high')
-        middle = os.path.join(data_dir, 'test/middle')
+        high = os.path.join(data_dir, "test/high")
+        middle = os.path.join(data_dir, "test/middle")
        high = self._read_txt(high)
        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, 'test')
+        return self._create_examples(high + middle, "test")

    def get_labels(self):
        """See base class."""
@@ -131,13 +122,12 @@ class RaceProcessor(DataProcessor):
        lines = []
        files = glob.glob(input_dir + "/*txt")
        for file in tqdm.tqdm(files, desc="read files"):
-            with open(file, 'r', encoding='utf-8') as fin:
+            with open(file, "r", encoding="utf-8") as fin:
                data_raw = json.load(fin)
                data_raw["race_id"] = file
                lines.append(data_raw)
        return lines

-
    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
@@ -145,19 +135,22 @@ class RaceProcessor(DataProcessor):
            race_id = "%s-%s" % (set_type, data_raw["race_id"])
            article = data_raw["article"]
            for i in range(len(data_raw["answers"])):
-                truth = str(ord(data_raw['answers'][i]) - ord('A'))
-                question = data_raw['questions'][i]
-                options = data_raw['options'][i]
+                truth = str(ord(data_raw["answers"][i]) - ord("A"))
+                question = data_raw["questions"][i]
+                options = data_raw["options"][i]

                examples.append(
                    InputExample(
                        example_id=race_id,
                        question=question,
-                        contexts=[article, article, article, article], # this is not efficient but convenient
+                        contexts=[article, article, article, article],  # this is not efficient but convenient
                        endings=[options[0], options[1], options[2], options[3]],
-                        label=truth))
+                        label=truth,
+                    )
+                )
        return examples

+
 class SwagProcessor(DataProcessor):
    """Processor for the SWAG data set."""

@@ -179,27 +172,25 @@ class SwagProcessor(DataProcessor):
            "setting!"
        )
        return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test")
+
    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_csv(self, input_file):
-        with open(input_file, 'r', encoding='utf-8') as f:
+        with open(input_file, "r", encoding="utf-8") as f:
            reader = csv.reader(f)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
-                    line = list(unicode(cell, 'utf-8') for cell in line)
+                    line = list(unicode(cell, "utf-8") for cell in line)
                lines.append(line)
            return lines

-
    def _create_examples(self, lines: List[List[str]], type: str):
        """Creates examples for the training and dev sets."""
-        if type == "train" and lines[0][-1] != 'label':
-            raise ValueError(
-                "For training, the input file must contain a label column."
-            )
+        if type == "train" and lines[0][-1] != "label":
+            raise ValueError("For training, the input file must contain a label column.")

        examples = [
            InputExample(
@@ -207,10 +198,11 @@ class SwagProcessor(DataProcessor):
                question=line[5],  # in the swag dataset, the
                # common beginning of each
                # choice is stored in "sent2".
-                contexts = [line[4], line[4], line[4], line[4]],
-                endings = [line[7], line[8], line[9], line[10]],
-                label=line[11]
-            ) for line in lines[1:]  # we skip the line with the column names
+                contexts=[line[4], line[4], line[4], line[4]],
+                endings=[line[7], line[8], line[9], line[10]],
+                label=line[11],
+            )
+            for line in lines[1:]  # we skip the line with the column names
        ]

        return examples
@@ -238,15 +230,14 @@ class ArcProcessor(DataProcessor):
        return ["0", "1", "2", "3"]

    def _read_json(self, input_file):
-        with open(input_file, 'r', encoding='utf-8') as fin:
+        with open(input_file, "r", encoding="utf-8") as fin:
            lines = fin.readlines()
            return lines

-
    def _create_examples(self, lines, type):
        """Creates examples for the training and dev sets."""

-        #There are two types of labels. They should be normalized
+        # There are two types of labels. They should be normalized
        def normalize(truth):
            if truth in "ABCD":
                return ord(truth) - ord("A")
@@ -283,12 +274,18 @@ class ArcProcessor(DataProcessor):
            if len(options) == 4:
                examples.append(
                    InputExample(
-                        example_id = id,
+                        example_id=id,
                        question=question,
-                        contexts=[options[0]["para"].replace("_", ""), options[1]["para"].replace("_", ""),
-                                  options[2]["para"].replace("_", ""), options[3]["para"].replace("_", "")],
+                        contexts=[
+                            options[0]["para"].replace("_", ""),
+                            options[1]["para"].replace("_", ""),
+                            options[2]["para"].replace("_", ""),
+                            options[3]["para"].replace("_", ""),
+                        ],
                        endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
-                        label=truth))
+                        label=truth,
+                    )
+                )

        if type == "train":
            assert len(examples) > 1
@@ -316,7 +313,7 @@ def convert_examples_to_features(
    Loads a data file into a list of `InputFeatures`
    """

-    label_map = {label : i for i, label in enumerate(label_list)}
+    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
@@ -331,16 +328,13 @@ def convert_examples_to_features(
            else:
                text_b = example.question + " " + ending

-            inputs = tokenizer.encode_plus(
-                text_a,
-                text_b,
-                add_special_tokens=True,
-                max_length=max_length,
-            )
-            if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0:
-                logger.info('Attention! you are cropping tokens (swag task is ok). '
-                        'If you are training ARC and RACE and you are poping question + options,'
-                        'you need to try to use a bigger max seq length!')
+            inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length,)
+            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
+                logger.info(
+                    "Attention! you are cropping tokens (swag task is ok). "
+                    "If you are training ARC and RACE and you are poping question + options,"
+                    "you need to try to use a bigger max seq length!"
+                )

            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

@@ -364,7 +358,6 @@ def convert_examples_to_features(
            assert len(token_type_ids) == max_length
            choices_features.append((input_ids, attention_mask, token_type_ids))

-
        label = label_map[example.label]

        if ex_index < 2:
@@ -372,33 +365,17 @@ def convert_examples_to_features(
            logger.info("race_id: {}".format(example.example_id))
            for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
                logger.info("choice: {}".format(choice_idx))
-                logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
-                logger.info("attention_mask: {}".format(' '.join(map(str, attention_mask))))
-                logger.info("token_type_ids: {}".format(' '.join(map(str, token_type_ids))))
+                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
+                logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask))))
+                logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids))))
                logger.info("label: {}".format(label))

-        features.append(
-            InputFeatures(
-                example_id=example.example_id,
-                choices_features=choices_features,
-                label=label,
-            )
-        )
+        features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,))

    return features


+processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor}


-processors = {
-    "race": RaceProcessor,
-    "swag": SwagProcessor,
-    "arc": ArcProcessor
-}
-
-
-MULTIPLE_CHOICE_TASKS_NUM_LABELS = {
-    "race", 4,
-    "swag", 4,
-    "arc", 4
-}
+MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4}
--- a/examples/utils_ner.py
+++ b/examples/utils_ner.py
@@ -61,9 +61,7 @@ def read_examples_from_file(data_dir, mode):
        for line in f:
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                if words:
-                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index),
-                                                 words=words,
-                                                 labels=labels))
+                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels))
                    guid_index += 1
                    words = []
                    labels = []
@@ -76,27 +74,27 @@ def read_examples_from_file(data_dir, mode):
                    # Examples could have no label for mode = "test"
                    labels.append("O")
        if words:
-            examples.append(InputExample(guid="%s-%d".format(mode, guid_index),
-                                         words=words,
-                                         labels=labels))
+            examples.append(InputExample(guid="%s-%d".format(mode, guid_index), words=words, labels=labels))
    return examples


-def convert_examples_to_features(examples,
-                                 label_list,
-                                 max_seq_length,
-                                 tokenizer,
-                                 cls_token_at_end=False,
-                                 cls_token="[CLS]",
-                                 cls_token_segment_id=1,
-                                 sep_token="[SEP]",
-                                 sep_token_extra=False,
-                                 pad_on_left=False,
-                                 pad_token=0,
-                                 pad_token_segment_id=0,
-                                 pad_token_label_id=-100,
-                                 sequence_a_segment_id=0,
-                                 mask_padding_with_zero=True):
+def convert_examples_to_features(
+    examples,
+    label_list,
+    max_seq_length,
+    tokenizer,
+    cls_token_at_end=False,
+    cls_token="[CLS]",
+    cls_token_segment_id=1,
+    sep_token="[SEP]",
+    sep_token_extra=False,
+    pad_on_left=False,
+    pad_token=0,
+    pad_token_segment_id=0,
+    pad_token_label_id=-100,
+    sequence_a_segment_id=0,
+    mask_padding_with_zero=True,
+):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
@@ -122,8 +120,8 @@ def convert_examples_to_features(examples,
        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = 3 if sep_token_extra else 2
        if len(tokens) > max_seq_length - special_tokens_count:
-            tokens = tokens[:(max_seq_length - special_tokens_count)]
-            label_ids = label_ids[:(max_seq_length - special_tokens_count)]
+            tokens = tokens[: (max_seq_length - special_tokens_count)]
+            label_ids = label_ids[: (max_seq_length - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
@@ -174,10 +172,10 @@ def convert_examples_to_features(examples,
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
        else:
-            input_ids += ([pad_token] * padding_length)
-            input_mask += ([0 if mask_padding_with_zero else 1] * padding_length)
-            segment_ids += ([pad_token_segment_id] * padding_length)
-            label_ids += ([pad_token_label_id] * padding_length)
+            input_ids += [pad_token] * padding_length
+            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
+            segment_ids += [pad_token_segment_id] * padding_length
+            label_ids += [pad_token_label_id] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
@@ -194,10 +192,8 @@ def convert_examples_to_features(examples,
            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))

        features.append(
-                InputFeatures(input_ids=input_ids,
-                              input_mask=input_mask,
-                              segment_ids=segment_ids,
-                              label_ids=label_ids))
+            InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids)
+        )
    return features


@@ -209,4 +205,4 @@ def get_labels(path):
            labels = ["O"] + labels
        return labels
    else:
-        return ["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
+        return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]