Black 20 release

2020-08-26 17:20:22 +02:00
parent e78c110338
commit a75c64d80c
191 changed files with 4807 additions and 3503 deletions
--- a/examples/adversarial/utils_hans.py
+++ b/examples/adversarial/utils_hans.py
@@ -112,7 +112,10 @@ if is_torch_available():
            cached_features_file = os.path.join(
                data_dir,
                "cached_{}_{}_{}_{}".format(
-                    "dev" if evaluate else "train", tokenizer.__class__.__name__, str(max_seq_length), task,
+                    "dev" if evaluate else "train",
+                    tokenizer.__class__.__name__,
+                    str(max_seq_length),
+                    task,
                ),
            )
            label_list = processor.get_labels()
@@ -278,7 +281,10 @@ class HansProcessor(DataProcessor):


 def hans_convert_examples_to_features(
-    examples: List[InputExample], label_list: List[str], max_length: int, tokenizer: PreTrainedTokenizer,
+    examples: List[InputExample],
+    label_list: List[str],
+    max_length: int,
+    tokenizer: PreTrainedTokenizer,
 ):
    """
    Loads a data file into a list of ``InputFeatures``
--- a/examples/benchmarking/plot_csv_file.py
+++ b/examples/benchmarking/plot_csv_file.py
@@ -20,7 +20,9 @@ class PlotArguments:
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
    """

-    csv_file: str = field(metadata={"help": "The csv file to plot."},)
+    csv_file: str = field(
+        metadata={"help": "The csv file to plot."},
+    )
    plot_along_batch: bool = field(
        default=False,
        metadata={"help": "Whether to plot along batch size or sequence lengh. Defaults to sequence length."},
@@ -30,7 +32,8 @@ class PlotArguments:
        metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
    )
    no_log_scale: bool = field(
-        default=False, metadata={"help": "Disable logarithmic scale when plotting"},
+        default=False,
+        metadata={"help": "Disable logarithmic scale when plotting"},
    )
    is_train: bool = field(
        default=False,
@@ -39,7 +42,8 @@ class PlotArguments:
        },
    )
    figure_png_file: Optional[str] = field(
-        default=None, metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
+        default=None,
+        metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
    )
    short_model_names: Optional[List[str]] = list_field(
        default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}
--- a/examples/bert-loses-patience/pabee/modeling_pabee_albert.py
+++ b/examples/bert-loses-patience/pabee/modeling_pabee_albert.py
@@ -101,30 +101,30 @@ class AlbertModelWithPabee(AlbertModel):
        regression=False,
    ):
        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during pre-training.
+        Return:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
+            last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+                Sequence of hidden-states at the output of the last layer of the model.
+            pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
+                Last layer hidden-state of the first token of the sequence (classification token)
+                further processed by a Linear layer and a Tanh activation function. The Linear
+                layer weights are trained from the next sentence prediction (classification)
+                objective during pre-training.

-            This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+                This output is usually *not* a good summary
+                of the semantic content of the input, you're often better with averaging or pooling
+                the sequence of hidden-states for the whole input sequence.
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
        """

        if input_ids is not None and inputs_embeds is not None:
@@ -157,7 +157,10 @@ class AlbertModelWithPabee(AlbertModel):
            res = []
            for i in range(self.config.num_hidden_layers):
                encoder_outputs = self.encoder.adaptive_forward(
-                    encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
+                    encoder_outputs,
+                    current_layer=i,
+                    attention_mask=extended_attention_mask,
+                    head_mask=head_mask,
                )

                pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
@@ -174,7 +177,10 @@ class AlbertModelWithPabee(AlbertModel):
            for i in range(self.config.num_hidden_layers):
                calculated_layer_num += 1
                encoder_outputs = self.encoder.adaptive_forward(
-                    encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
+                    encoder_outputs,
+                    current_layer=i,
+                    attention_mask=extended_attention_mask,
+                    head_mask=head_mask,
                )

                pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
@@ -236,42 +242,42 @@ class AlbertForSequenceClassificationWithPabee(AlbertPreTrainedModel):
        labels=None,
    ):
        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in ``[0, ..., config.num_labels - 1]``.
-            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
-            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the sequence classification/regression loss.
+                Indices should be in ``[0, ..., config.num_labels - 1]``.
+                If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+                If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
-        loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Classification (or regression if config.num_labels==1) loss.
-        logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
+            loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+                Classification (or regression if config.num_labels==1) loss.
+            logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+                Classification (or regression if config.num_labels==1) scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.

-        Examples::
+            Examples::

-            from transformers import AlbertTokenizer
-            from pabee import AlbertForSequenceClassificationWithPabee
-            import torch
+                from transformers import AlbertTokenizer
+                from pabee import AlbertForSequenceClassificationWithPabee
+                import torch

-            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-            model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert-base-v2')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-            loss, logits = outputs[:2]
+                tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+                model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert-base-v2')
+                input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+                labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+                outputs = model(input_ids, labels=labels)
+                loss, logits = outputs[:2]

        """

--- a/examples/bert-loses-patience/pabee/modeling_pabee_bert.py
+++ b/examples/bert-loses-patience/pabee/modeling_pabee_bert.py
@@ -108,30 +108,30 @@ class BertModelWithPabee(BertModel):
        regression=False,
    ):
        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during pre-training.
+        Return:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+            last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+                Sequence of hidden-states at the output of the last layer of the model.
+            pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
+                Last layer hidden-state of the first token of the sequence (classification token)
+                further processed by a Linear layer and a Tanh activation function. The Linear
+                layer weights are trained from the next sentence prediction (classification)
+                objective during pre-training.

-            This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+                This output is usually *not* a good summary
+                of the semantic content of the input, you're often better with averaging or pooling
+                the sequence of hidden-states for the whole input sequence.
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
        """

        if input_ids is not None and inputs_embeds is not None:
@@ -266,44 +266,44 @@ class BertForSequenceClassificationWithPabee(BertPreTrainedModel):
        labels=None,
    ):
        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
-            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the sequence classification/regression loss.
+                Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+                If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+                If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+                Classification (or regression if config.num_labels==1) loss.
+            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+                Classification (or regression if config.num_labels==1) scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.

-    Examples::
+        Examples::

-        from transformers import BertTokenizer, BertForSequenceClassification
-        from pabee import BertForSequenceClassificationWithPabee
-        import torch
+            from transformers import BertTokenizer, BertForSequenceClassification
+            from pabee import BertForSequenceClassificationWithPabee
+            import torch

-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForSequenceClassificationWithPabee.from_pretrained('bert-base-uncased')
+            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+            model = BertForSequenceClassificationWithPabee.from_pretrained('bert-base-uncased')

-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
+            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+            outputs = model(input_ids, labels=labels)

-        loss, logits = outputs[:2]
+            loss, logits = outputs[:2]

        """

--- a/examples/bert-loses-patience/run_glue_with_pabee.py
+++ b/examples/bert-loses-patience/run_glue_with_pabee.py
@@ -120,7 +120,10 @@ def train(args, train_dataset, model, tokenizer):
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
+            model,
+            device_ids=[args.local_rank],
+            output_device=args.local_rank,
+            find_unused_parameters=True,
        )

    # Train!
@@ -151,13 +154,17 @@ def train(args, train_dataset, model, tokenizer):
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info(
-            "  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch,
+            "  Will skip the first %d steps in the first epoch",
+            steps_trained_in_current_epoch,
        )

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
-        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
+        epochs_trained,
+        int(args.num_train_epochs),
+        desc="Epoch",
+        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproductibility
    for _ in train_iterator:
@@ -372,7 +379,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )
        features = convert_examples_to_features(
-            examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
+            examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -434,15 +445,24 @@ def main():
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
-        "--patience", default="0", type=str, required=False,
+        "--patience",
+        default="0",
+        type=str,
+        required=False,
    )
    parser.add_argument(
-        "--regression_threshold", default=0, type=float, required=False,
+        "--regression_threshold",
+        default=0,
+        type=float,
+        required=False,
    )

    # Other parameters
    parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
+        "--config_name",
+        default="",
+        type=str,
+        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
@@ -466,17 +486,27 @@ def main():
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
+        "--evaluate_during_training",
+        action="store_true",
+        help="Run evaluation during training at each logging step.",
    )
    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
+        "--do_lower_case",
+        action="store_true",
+        help="Set this flag if you are using an uncased model.",
    )

    parser.add_argument(
-        "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
+        "--per_gpu_train_batch_size",
+        default=8,
+        type=int,
+        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=1, type=int, help="Batch size per GPU/CPU for evaluation.",
+        "--per_gpu_eval_batch_size",
+        default=1,
+        type=int,
+        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
@@ -485,13 +515,19 @@ def main():
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
-        "--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.",
+        "--learning_rate",
+        default=5e-5,
+        type=float,
+        help="The initial learning rate for Adam.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
+        "--num_train_epochs",
+        default=3.0,
+        type=float,
+        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--max_steps",
@@ -503,7 +539,10 @@ def main():

    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument(
-        "--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.",
+        "--save_steps",
+        type=int,
+        default=500,
+        help="Save checkpoint every X updates steps.",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
@@ -512,10 +551,14 @@ def main():
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
+        "--overwrite_output_dir",
+        action="store_true",
+        help="Overwrite the content of the output directory",
    )
    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
+        "--overwrite_cache",
+        action="store_true",
+        help="Overwrite the cached training and evaluation sets",
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

@@ -532,7 +575,10 @@ def main():
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument(
-        "--local_rank", type=int, default=-1, help="For distributed training: local_rank",
+        "--local_rank",
+        type=int,
+        default=-1,
+        help="For distributed training: local_rank",
    )
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
@@ -634,7 +680,8 @@ def main():
    print("Output Layers Parameters:", output_layers_param_num)
    single_output_layer_param_num = sum(param.numel() for param in model.classifiers[0].parameters())
    print(
-        "Added Output Layers Parameters:", output_layers_param_num - single_output_layer_param_num,
+        "Added Output Layers Parameters:",
+        output_layers_param_num - single_output_layer_param_num,
    )

    logger.info("Training/evaluation parameters %s", args)
--- a/examples/bertology/run_bertology.py
+++ b/examples/bertology/run_bertology.py
@@ -66,9 +66,9 @@ def print_2d_tensor(tensor):
 def compute_heads_importance(
    args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None, actually_pruned=False
 ):
-    """ This method shows how to compute:
-        - head attention entropy
-        - head importance scores according to http://arxiv.org/abs/1905.10650
+    """This method shows how to compute:
+    - head attention entropy
+    - head importance scores according to http://arxiv.org/abs/1905.10650
    """
    # Prepare our tensors
    n_layers, n_heads = model.config.num_hidden_layers, model.config.num_attention_heads
@@ -150,8 +150,8 @@ def compute_heads_importance(


 def mask_heads(args, model, eval_dataloader):
-    """ This method shows how to mask head (set some heads to zero), to test the effect on the network,
-        based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+    """This method shows how to mask head (set some heads to zero), to test the effect on the network,
+    based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
    """
    _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
@@ -201,8 +201,8 @@ def mask_heads(args, model, eval_dataloader):


 def prune_heads(args, model, eval_dataloader, head_mask):
-    """ This method shows how to prune head (remove heads weights) based on
-        the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+    """This method shows how to prune head (remove heads weights) based on
+    the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
    """
    # Try pruning and test time speedup
    # Pruning is like masking but we actually remove the masked weights
@@ -395,7 +395,8 @@ def main():
        cache_dir=args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir,
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        cache_dir=args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
--- a/examples/contrib/mm-imdb/utils_mmimdb.py
+++ b/examples/contrib/mm-imdb/utils_mmimdb.py
@@ -138,6 +138,9 @@ def get_image_transforms():
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
-            transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],),
+            transforms.Normalize(
+                mean=[0.46777044, 0.44531429, 0.40661017],
+                std=[0.12221994, 0.12145835, 0.14380469],
+            ),
        ]
    )
--- a/examples/contrib/run_camembert.py
+++ b/examples/contrib/run_camembert.py
@@ -30,7 +30,11 @@ def fill_mask(masked_input, model, tokenizer, topk=5):
            )
        else:
            topk_filled_outputs.append(
-                (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
+                (
+                    masked_input.replace(masked_token, predicted_token),
+                    values[index].item(),
+                    predicted_token,
+                )
            )
    return topk_filled_outputs

--- a/examples/contrib/run_openai_gpt.py
+++ b/examples/contrib/run_openai_gpt.py
@@ -71,10 +71,10 @@ def load_rocstories_dataset(dataset_path):


 def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
-    """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
+    """Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)

-        To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
-        input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
+    To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
+    input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
    """
    tensor_datasets = []
    for dataset in encoded_datasets:
@@ -83,7 +83,10 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
        mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
        lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
        mc_labels = np.zeros((n_batch,), dtype=np.int64)
-        for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
+        for (
+            i,
+            (story, cont1, cont2, mc_label),
+        ) in enumerate(dataset):
            with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
            with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
            input_ids[i, 0, : len(with_cont1)] = with_cont1
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -629,7 +629,9 @@ def main():
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,)
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+    )
    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
    )
--- a/examples/deebert/run_glue_deebert.py
+++ b/examples/deebert/run_glue_deebert.py
@@ -358,7 +358,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )
        features = convert_examples_to_features(
-            examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
+            examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
--- a/examples/deebert/src/modeling_highway_bert.py
+++ b/examples/deebert/src/modeling_highway_bert.py
@@ -14,8 +14,7 @@ from transformers.modeling_bert import (


 def entropy(x):
-    """ Calculate entropy of a pre-softmax logit Tensor
-    """
+    """Calculate entropy of a pre-softmax logit Tensor"""
    exp_x = torch.exp(x)
    A = torch.sum(exp_x, dim=1)  # sum of exp(x_i)
    B = torch.sum(x * exp_x, dim=1)  # sum of x_i * exp(x_i)
@@ -104,7 +103,8 @@ class DeeBertEncoder(nn.Module):


@add_start_docstrings(
-    "The Bert Model transformer with early exiting (DeeBERT). ", BERT_START_DOCSTRING,
+    "The Bert Model transformer with early exiting (DeeBERT). ",
+    BERT_START_DOCSTRING,
 )
 class DeeBertModel(BertPreTrainedModel):
    def __init__(self, config):
@@ -127,9 +127,9 @@ class DeeBertModel(BertPreTrainedModel):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)
@@ -147,33 +147,33 @@ class DeeBertModel(BertPreTrainedModel):
        encoder_attention_mask=None,
    ):
        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during pre-training.
+        Return:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+            last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+                Sequence of hidden-states at the output of the last layer of the model.
+            pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
+                Last layer hidden-state of the first token of the sequence (classification token)
+                further processed by a Linear layer and a Tanh activation function. The Linear
+                layer weights are trained from the next sentence prediction (classification)
+                objective during pre-training.

-            This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+                This output is usually *not* a good summary
+                of the semantic content of the input, you're often better with averaging or pooling
+                the sequence of hidden-states for the whole input sequence.
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
-            Tuple of each early exit's results (total length: number of layers)
-            Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
+            highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
+                Tuple of each early exit's results (total length: number of layers)
+                Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
        """
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -302,32 +302,32 @@ class DeeBertForSequenceClassification(BertPreTrainedModel):
        train_highway=False,
    ):
        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
-            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the sequence classification/regression loss.
+                Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+                If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+                If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+                Classification (or regression if config.num_labels==1) loss.
+            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+                Classification (or regression if config.num_labels==1) scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
-            Tuple of each early exit's results (total length: number of layers)
-            Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
+            highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
+                Tuple of each early exit's results (total length: number of layers)
+                Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
        """

        exit_layer = self.num_layers
--- a/examples/deebert/src/modeling_highway_roberta.py
+++ b/examples/deebert/src/modeling_highway_roberta.py
@@ -11,7 +11,8 @@ from .modeling_highway_bert import BertPreTrainedModel, DeeBertModel, HighwayExc


@add_start_docstrings(
-    "The RoBERTa Model transformer with early exiting (DeeRoBERTa). ", ROBERTA_START_DOCSTRING,
+    "The RoBERTa Model transformer with early exiting (DeeRoBERTa). ",
+    ROBERTA_START_DOCSTRING,
 )
 class DeeRobertaModel(DeeBertModel):

@@ -58,32 +59,32 @@ class DeeRobertaForSequenceClassification(BertPreTrainedModel):
        train_highway=False,
    ):
        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
-            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the sequence classification/regression loss.
+                Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+                If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+                If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+                Classification (or regression if config.num_labels==1) loss.
+            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+                Classification (or regression if config.num_labels==1) scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
-            Tuple of each early exit's results (total length: number of layers)
-            Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
+            highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
+                Tuple of each early exit's results (total length: number of layers)
+                Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
        """

        exit_layer = self.num_layers
--- a/examples/distillation/run_squad_w_distillation.py
+++ b/examples/distillation/run_squad_w_distillation.py
@@ -228,14 +228,20 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                assert end_logits_tea.size() == end_logits_stu.size()

                loss_fct = nn.KLDivLoss(reduction="batchmean")
-                loss_start = loss_fct(
-                    F.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                    F.softmax(start_logits_tea / args.temperature, dim=-1),
-                ) * (args.temperature ** 2)
-                loss_end = loss_fct(
-                    F.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                    F.softmax(end_logits_tea / args.temperature, dim=-1),
-                ) * (args.temperature ** 2)
+                loss_start = (
+                    loss_fct(
+                        F.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                        F.softmax(start_logits_tea / args.temperature, dim=-1),
+                    )
+                    * (args.temperature ** 2)
+                )
+                loss_end = (
+                    loss_fct(
+                        F.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                        F.softmax(end_logits_tea / args.temperature, dim=-1),
+                    )
+                    * (args.temperature ** 2)
+                )
                loss_ce = (loss_start + loss_end) / 2.0

                loss = args.alpha_ce * loss_ce + args.alpha_squad * loss
--- a/examples/distillation/utils.py
+++ b/examples/distillation/utils.py
@@ -118,7 +118,8 @@ def init_gpu_params(params):
    if params.multi_gpu:
        logger.info("Initializing PyTorch distributed")
        torch.distributed.init_process_group(
-            init_method="env://", backend="nccl",
+            init_method="env://",
+            backend="nccl",
        )


--- a/examples/language-modeling/run_language_modeling.py
+++ b/examples/language-modeling/run_language_modeling.py
@@ -233,7 +233,9 @@ def main():
    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
-            tokenizer=tokenizer, plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length,
+            tokenizer=tokenizer,
+            plm_probability=data_args.plm_probability,
+            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
--- a/examples/lightning_base.py
+++ b/examples/lightning_base.py
@@ -226,10 +226,14 @@ class BaseTransformer(pl.LightningModule):
            help="Decoder layer dropout probability (Optional). Goes into model.config",
        )
        parser.add_argument(
-            "--dropout", type=float, help="Dropout probability (Optional). Goes into model.config",
+            "--dropout",
+            type=float,
+            help="Dropout probability (Optional). Goes into model.config",
        )
        parser.add_argument(
-            "--attention_dropout", type=float, help="Attention dropout probability (Optional). Goes into model.config",
+            "--attention_dropout",
+            type=float,
+            help="Attention dropout probability (Optional). Goes into model.config",
        )
        parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
        parser.add_argument(
--- a/examples/longform-qa/eli5_app.py
+++ b/examples/longform-qa/eli5_app.py
@@ -95,7 +95,10 @@ def make_support(question, source="wiki40b", method="dense", n_results=10):
            )
        else:
            support_doc, hit_lst = query_es_index(
-                question, es_client, index_name="english_wiki40b_snippets_100w", n_results=n_results,
+                question,
+                es_client,
+                index_name="english_wiki40b_snippets_100w",
+                n_results=n_results,
            )
    support_list = [
        (res["article_title"], res["section_title"].strip(), res["score"], res["passage_text"]) for res in hit_lst
@@ -154,7 +157,8 @@ header_full = """
    header_html,
 )
 st.sidebar.markdown(
-    header_full, unsafe_allow_html=True,
+    header_full,
+    unsafe_allow_html=True,
 )

 # Long Form QA with ELI5 and Wikipedia
@@ -173,9 +177,17 @@ action_list = [
 ]
 demo_options = st.sidebar.checkbox("Demo options")
 if demo_options:
-    action_st = st.sidebar.selectbox("", action_list, index=3,)
+    action_st = st.sidebar.selectbox(
+        "",
+        action_list,
+        index=3,
+    )
    action = action_list.index(action_st)
-    show_type = st.sidebar.selectbox("", ["Show full text of passages", "Show passage section titles"], index=0,)
+    show_type = st.sidebar.selectbox(
+        "",
+        ["Show full text of passages", "Show passage section titles"],
+        index=0,
+    )
    show_passages = show_type == "Show full text of passages"
 else:
    action = 3
@@ -250,7 +262,9 @@ questions_list = [
    "How does New Zealand have so many large bird predators?",
 ]
 question_s = st.selectbox(
-    "What would you like to ask? ---- select <MY QUESTION> to enter a new query", questions_list, index=1,
+    "What would you like to ask? ---- select <MY QUESTION> to enter a new query",
+    questions_list,
+    index=1,
 )
 if question_s == "<MY QUESTION>":
    question = st.text_input("Enter your question here:", "")
--- a/examples/longform-qa/eli5_utils.py
+++ b/examples/longform-qa/eli5_utils.py
@@ -48,7 +48,11 @@ def make_es_index_snippets(es_client, passages_dset, index_name="english_wiki_ki
            yield passage

    # create the ES index
-    for ok, action in streaming_bulk(client=es_client, index=index_name, actions=passage_generator(),):
+    for ok, action in streaming_bulk(
+        client=es_client,
+        index=index_name,
+        actions=passage_generator(),
+    ):
        progress.update(1)
        successes += ok
    print("Indexed %d documents" % (successes,))
@@ -137,7 +141,11 @@ class RetrievalQAEmbedder(torch.nn.Module):

            # define function for checkpointing
            def partial_encode(*inputs):
-                encoder_outputs = self.sent_encoder.encoder(inputs[0], attention_mask=inputs[1], head_mask=head_mask,)
+                encoder_outputs = self.sent_encoder.encoder(
+                    inputs[0],
+                    attention_mask=inputs[1],
+                    head_mask=head_mask,
+                )
                sequence_output = encoder_outputs[0]
                pooled_output = self.sent_encoder.pooler(sequence_output)
                return pooled_output
@@ -234,7 +242,11 @@ def train_qa_retriever_epoch(model, dataset, tokenizer, optimizer, scheduler, ar
        if step % args.print_freq == 0 or step == 1:
            print(
                "{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
-                    e, step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
+                    e,
+                    step,
+                    len(dataset) // args.batch_size,
+                    loc_loss / loc_steps,
+                    time() - st_time,
                )
            )
            loc_loss = 0
@@ -273,7 +285,11 @@ def train_qa_retriever_joint_epoch(model, dataset_list, tokenizer, optimizer, sc
        if step % args.print_freq == 0:
            print(
                "{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
-                    e, step, len(dataset_list[0]) // args.batch_size, loc_loss / loc_steps, time() - st_time,
+                    e,
+                    step,
+                    len(dataset_list[0]) // args.batch_size,
+                    loc_loss / loc_steps,
+                    time() - st_time,
                )
            )
            loc_loss = 0
@@ -354,7 +370,8 @@ class ELI5DatasetS2S(Dataset):
            self.document_cache[q_id] = self.document_cache.get(q_id, self.make_doc_function(example["title"]))
        document = self.document_cache[q_id]
        in_st = "question: {} context: {}".format(
-            question.lower().replace(" --t--", "").strip(), document.lower().strip(),
+            question.lower().replace(" --t--", "").strip(),
+            document.lower().strip(),
        )
        out_st = answer
        return (in_st, out_st)
@@ -427,7 +444,11 @@ def train_qa_s2s_epoch(model, dataset, tokenizer, optimizer, scheduler, args, e=
        if step % args.print_freq == 0 or step == 1:
            print(
                "{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
-                    e, step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
+                    e,
+                    step,
+                    len(dataset) // args.batch_size,
+                    loc_loss / loc_steps,
+                    time() - st_time,
                )
            )
            loc_loss = 0
@@ -456,10 +477,18 @@ def eval_qa_s2s_epoch(model, dataset, tokenizer, args):
            if step % args.print_freq == 0:
                print(
                    "{:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
-                        step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
+                        step,
+                        len(dataset) // args.batch_size,
+                        loc_loss / loc_steps,
+                        time() - st_time,
                    )
                )
-    print("Total \t L: {:.3f} \t -- {:.3f}".format(loc_loss / loc_steps, time() - st_time,))
+    print(
+        "Total \t L: {:.3f} \t -- {:.3f}".format(
+            loc_loss / loc_steps,
+            time() - st_time,
+        )
+    )


 def train_qa_s2s(qa_s2s_model, qa_s2s_tokenizer, s2s_train_dset, s2s_valid_dset, s2s_args):
@@ -506,7 +535,12 @@ def qa_s2s_generate(
    max_input_length=512,
    device="cuda:0",
 ):
-    model_inputs = make_qa_s2s_batch([(question_doc, "A")], qa_s2s_tokenizer, max_input_length, device=device,)
+    model_inputs = make_qa_s2s_batch(
+        [(question_doc, "A")],
+        qa_s2s_tokenizer,
+        max_input_length,
+        device=device,
+    )
    n_beams = num_answers if num_beams is None else max(num_beams, num_answers)
    generated_ids = qa_s2s_model.generate(
        input_ids=model_inputs["input_ids"],
--- a/examples/movement-pruning/emmental/modeling_bert_masked.py
+++ b/examples/movement-pruning/emmental/modeling_bert_masked.py
@@ -37,8 +37,7 @@ logger = logging.getLogger(__name__)


 class BertEmbeddings(nn.Module):
-    """Construct the embeddings from word, position and token_type embeddings.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
@@ -385,8 +384,8 @@ class BertPooler(nn.Module):


 class MaskedBertPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
    """

    config_class = MaskedBertConfig
@@ -492,9 +491,9 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)
@@ -685,31 +684,31 @@ class MaskedBertForSequenceClassification(MaskedBertPreTrainedModel):
        threshold=None,
    ):
        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
-            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        threshold (:obj:`float`):
-            Threshold value (see :class:`~emmental.MaskedLinear`).
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the sequence classification/regression loss.
+                Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+                If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+                If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            threshold (:obj:`float`):
+                Threshold value (see :class:`~emmental.MaskedLinear`).

-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+                Classification (or regression if config.num_labels==1) loss.
+            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+                Classification (or regression if config.num_labels==1) scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
        """

        outputs = self.bert(
@@ -770,32 +769,32 @@ class MaskedBertForMultipleChoice(MaskedBertPreTrainedModel):
        threshold=None,
    ):
        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the multiple choice classification loss.
-            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
-            of the input tensors. (see `input_ids` above)
-        threshold (:obj:`float`):
-            Threshold value (see :class:`~emmental.MaskedLinear`).
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the multiple choice classification loss.
+                Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
+                of the input tensors. (see `input_ids` above)
+            threshold (:obj:`float`):
+                Threshold value (see :class:`~emmental.MaskedLinear`).

-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Classification loss.
-        classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
-            `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
+                Classification loss.
+            classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
+                `num_choices` is the second dimension of the input tensors. (see `input_ids` above).

-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+                Classification scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.

        """
        num_choices = input_ids.shape[1]
@@ -860,29 +859,29 @@ class MaskedBertForTokenClassification(MaskedBertPreTrainedModel):
        threshold=None,
    ):
        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the token classification loss.
-            Indices should be in ``[0, ..., config.num_labels - 1]``.
-        threshold (:obj:`float`):
-            Threshold value (see :class:`~emmental.MaskedLinear`).
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+                Labels for computing the token classification loss.
+                Indices should be in ``[0, ..., config.num_labels - 1]``.
+            threshold (:obj:`float`):
+                Threshold value (see :class:`~emmental.MaskedLinear`).

-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
-            Classification loss.
-        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
+                Classification loss.
+            scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
+                Classification scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
        """

        outputs = self.bert(
@@ -947,36 +946,36 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel):
        threshold=None,
    ):
        r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        threshold (:obj:`float`):
-            Threshold value (see :class:`~emmental.MaskedLinear`).
+            start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for position (index) of the start of the labelled span for computing the token classification loss.
+                Positions are clamped to the length of the sequence (`sequence_length`).
+                Position outside of the sequence are not taken into account for computing the loss.
+            end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+                Labels for position (index) of the end of the labelled span for computing the token classification loss.
+                Positions are clamped to the length of the sequence (`sequence_length`).
+                Position outside of the sequence are not taken into account for computing the loss.
+            threshold (:obj:`float`):
+                Threshold value (see :class:`~emmental.MaskedLinear`).

-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-start scores (before SoftMax).
-        end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+                Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+            start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
+                Span-start scores (before SoftMax).
+            end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
+                Span-end scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
        """

        outputs = self.bert(
@@ -996,7 +995,10 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel):
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

-        outputs = (start_logits, end_logits,) + outputs[2:]
+        outputs = (
+            start_logits,
+            end_logits,
+        ) + outputs[2:]
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
--- a/examples/movement-pruning/masked_run_glue.py
+++ b/examples/movement-pruning/masked_run_glue.py
@@ -173,7 +173,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
+            model,
+            device_ids=[args.local_rank],
+            output_device=args.local_rank,
+            find_unused_parameters=True,
        )

    # Train!
@@ -217,7 +220,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
-        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
+        epochs_trained,
+        int(args.num_train_epochs),
+        desc="Epoch",
+        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproductibility
    for _ in train_iterator:
@@ -280,11 +286,14 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                        attention_mask=inputs["attention_mask"],
                    )

-                loss_logits = F.kl_div(
-                    input=F.log_softmax(logits_stu / args.temperature, dim=-1),
-                    target=F.softmax(logits_tea / args.temperature, dim=-1),
-                    reduction="batchmean",
-                ) * (args.temperature ** 2)
+                loss_logits = (
+                    F.kl_div(
+                        input=F.log_softmax(logits_stu / args.temperature, dim=-1),
+                        target=F.softmax(logits_tea / args.temperature, dim=-1),
+                        reduction="batchmean",
+                    )
+                    * (args.temperature ** 2)
+                )

                loss = args.alpha_distil * loss_logits + args.alpha_ce * loss

@@ -529,7 +538,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )
        features = convert_examples_to_features(
-            examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=output_mode,
+            examples,
+            tokenizer,
+            max_length=args.max_seq_length,
+            label_list=label_list,
+            output_mode=output_mode,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -592,7 +605,10 @@ def main():
    )
    # Other parameters
    parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
+        "--config_name",
+        default="",
+        type=str,
+        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
@@ -616,17 +632,27 @@ def main():
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
+        "--evaluate_during_training",
+        action="store_true",
+        help="Run evaluation during training at each logging step.",
    )
    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
+        "--do_lower_case",
+        action="store_true",
+        help="Set this flag if you are using an uncased model.",
    )

    parser.add_argument(
-        "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
+        "--per_gpu_train_batch_size",
+        default=8,
+        type=int,
+        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.",
+        "--per_gpu_eval_batch_size",
+        default=8,
+        type=int,
+        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")

@@ -723,7 +749,10 @@ def main():
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
+        "--num_train_epochs",
+        default=3.0,
+        type=float,
+        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--max_steps",
@@ -742,10 +771,14 @@ def main():
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
+        "--overwrite_output_dir",
+        action="store_true",
+        help="Overwrite the content of the output directory",
    )
    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
+        "--overwrite_cache",
+        action="store_true",
+        help="Overwrite the cached training and evaluation sets",
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

--- a/examples/movement-pruning/masked_run_squad.py
+++ b/examples/movement-pruning/masked_run_squad.py
@@ -181,7 +181,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
+            model,
+            device_ids=[args.local_rank],
+            output_device=args.local_rank,
+            find_unused_parameters=True,
        )

    # Train!
@@ -304,16 +307,22 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                        attention_mask=inputs["attention_mask"],
                    )

-                loss_start = F.kl_div(
-                    input=F.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                    target=F.softmax(start_logits_tea / args.temperature, dim=-1),
-                    reduction="batchmean",
-                ) * (args.temperature ** 2)
-                loss_end = F.kl_div(
-                    input=F.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                    target=F.softmax(end_logits_tea / args.temperature, dim=-1),
-                    reduction="batchmean",
-                ) * (args.temperature ** 2)
+                loss_start = (
+                    F.kl_div(
+                        input=F.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                        target=F.softmax(start_logits_tea / args.temperature, dim=-1),
+                        reduction="batchmean",
+                    )
+                    * (args.temperature ** 2)
+                )
+                loss_end = (
+                    F.kl_div(
+                        input=F.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                        target=F.softmax(end_logits_tea / args.temperature, dim=-1),
+                        reduction="batchmean",
+                    )
+                    * (args.temperature ** 2)
+                )
                loss_logits = (loss_start + loss_end) / 2.0

                loss = args.alpha_distil * loss_logits + args.alpha_ce * loss
@@ -859,7 +868,10 @@ def main():
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
+        "--num_train_epochs",
+        default=3.0,
+        type=float,
+        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--max_steps",
--- a/examples/multiple-choice/utils_multiple_choice.py
+++ b/examples/multiple-choice/utils_multiple_choice.py
@@ -100,7 +100,12 @@ if is_torch_available():

            cached_features_file = os.path.join(
                data_dir,
-                "cached_{}_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length), task,),
+                "cached_{}_{}_{}_{}".format(
+                    mode.value,
+                    tokenizer.__class__.__name__,
+                    str(max_seq_length),
+                    task,
+                ),
            )

            # Make sure only the first process in distributed training processes the dataset,
@@ -121,7 +126,12 @@ if is_torch_available():
                    else:
                        examples = processor.get_train_examples(data_dir)
                    logger.info("Training examples: %s", len(examples))
-                    self.features = convert_examples_to_features(examples, label_list, max_seq_length, tokenizer,)
+                    self.features = convert_examples_to_features(
+                        examples,
+                        label_list,
+                        max_seq_length,
+                        tokenizer,
+                    )
                    logger.info("Saving features into cached file %s", cached_features_file)
                    torch.save(self.features, cached_features_file)

@@ -164,7 +174,12 @@ if is_tf_available():
                examples = processor.get_train_examples(data_dir)
            logger.info("Training examples: %s", len(examples))

-            self.features = convert_examples_to_features(examples, label_list, max_seq_length, tokenizer,)
+            self.features = convert_examples_to_features(
+                examples,
+                label_list,
+                max_seq_length,
+                tokenizer,
+            )

            def gen():
                for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
@@ -491,7 +506,10 @@ class ArcProcessor(DataProcessor):


 def convert_examples_to_features(
-    examples: List[InputExample], label_list: List[str], max_length: int, tokenizer: PreTrainedTokenizer,
+    examples: List[InputExample],
+    label_list: List[str],
+    max_length: int,
+    tokenizer: PreTrainedTokenizer,
 ) -> List[InputFeatures]:
    """
    Loads a data file into a list of `InputFeatures`
--- a/examples/question-answering/run_squad_trainer.py
+++ b/examples/question-answering/run_squad_trainer.py
@@ -137,7 +137,12 @@ def main():
    )

    # Initialize our Trainer
-    trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,)
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )

    # Training
    if training_args.do_train:
--- a/examples/question-answering/run_tf_squad.py
+++ b/examples/question-answering/run_tf_squad.py
@@ -231,7 +231,12 @@ def main():
    eval_dataset = eval_dataset.apply(tf.data.experimental.assert_cardinality(len(eval_examples)))

    # Initialize our Trainer
-    trainer = TFTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,)
+    trainer = TFTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )

    # Training
    if training_args.do_train:
--- a/examples/seq2seq/bertabs/configuration_bertabs.py
+++ b/examples/seq2seq/bertabs/configuration_bertabs.py
@@ -28,7 +28,7 @@ BERTABS_FINETUNED_CONFIG_MAP = {


 class BertAbsConfig(PretrainedConfig):
-    r""" Class to store the configuration of the BertAbs model.
+    r"""Class to store the configuration of the BertAbs model.

    Arguments:
        vocab_size: int
--- a/examples/seq2seq/bertabs/convert_bertabs_original_pytorch_checkpoint.py
+++ b/examples/seq2seq/bertabs/convert_bertabs_original_pytorch_checkpoint.py
@@ -62,7 +62,7 @@ BertAbsConfig = namedtuple(


 def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
-    """ Copy/paste and tweak the pre-trained weights provided by the creators
+    """Copy/paste and tweak the pre-trained weights provided by the creators
    of BertAbs for the internal architecture.
    """

@@ -164,13 +164,22 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
-        "--bertabs_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump.",
+        "--bertabs_checkpoint_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path the official PyTorch dump.",
    )
    parser.add_argument(
-        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model.",
+        "--pytorch_dump_folder_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the output PyTorch model.",
    )
    args = parser.parse_args()

    convert_bertabs_checkpoints(
-        args.bertabs_checkpoint_path, args.pytorch_dump_folder_path,
+        args.bertabs_checkpoint_path,
+        args.pytorch_dump_folder_path,
    )
--- a/examples/seq2seq/bertabs/modeling_bertabs.py
+++ b/examples/seq2seq/bertabs/modeling_bertabs.py
@@ -105,10 +105,17 @@ class BertAbs(BertAbsPreTrainedModel):
                p.data.zero_()

    def forward(
-        self, encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask,
+        self,
+        encoder_input_ids,
+        decoder_input_ids,
+        token_type_ids,
+        encoder_attention_mask,
+        decoder_attention_mask,
    ):
        encoder_output = self.bert(
-            input_ids=encoder_input_ids, token_type_ids=token_type_ids, attention_mask=encoder_attention_mask,
+            input_ids=encoder_input_ids,
+            token_type_ids=token_type_ids,
+            attention_mask=encoder_attention_mask,
        )
        encoder_hidden_states = encoder_output[0]
        dec_state = self.decoder.init_decoder_state(encoder_input_ids, encoder_hidden_states)
@@ -117,8 +124,7 @@ class BertAbs(BertAbsPreTrainedModel):


 class Bert(nn.Module):
-    """ This class is not really necessary and should probably disappear.
-    """
+    """This class is not really necessary and should probably disappear."""

    def __init__(self):
        super().__init__()
@@ -307,7 +313,14 @@ class TransformerDecoderLayer(nn.Module):
        self.register_buffer("mask", mask)

    def forward(
-        self, inputs, memory_bank, src_pad_mask, tgt_pad_mask, previous_input=None, layer_cache=None, step=None,
+        self,
+        inputs,
+        memory_bank,
+        src_pad_mask,
+        tgt_pad_mask,
+        previous_input=None,
+        layer_cache=None,
+        step=None,
    ):
        """
        Args:
@@ -331,13 +344,25 @@ class TransformerDecoderLayer(nn.Module):
            all_input = torch.cat((previous_input, input_norm), dim=1)
            dec_mask = None

-        query = self.self_attn(all_input, all_input, input_norm, mask=dec_mask, layer_cache=layer_cache, type="self",)
+        query = self.self_attn(
+            all_input,
+            all_input,
+            input_norm,
+            mask=dec_mask,
+            layer_cache=layer_cache,
+            type="self",
+        )

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid = self.context_attn(
-            memory_bank, memory_bank, query_norm, mask=src_pad_mask, layer_cache=layer_cache, type="context",
+            memory_bank,
+            memory_bank,
+            query_norm,
+            mask=src_pad_mask,
+            layer_cache=layer_cache,
+            type="context",
        )
        output = self.feed_forward(self.drop(mid) + query)

@@ -422,7 +447,14 @@ class MultiHeadedAttention(nn.Module):
            self.final_linear = nn.Linear(model_dim, model_dim)

    def forward(
-        self, key, value, query, mask=None, layer_cache=None, type=None, predefined_graph_1=None,
+        self,
+        key,
+        value,
+        query,
+        mask=None,
+        layer_cache=None,
+        type=None,
+        predefined_graph_1=None,
    ):
        """
        Compute the context vector and the attention vectors.
@@ -628,7 +660,7 @@ def gelu(x):


 class PositionwiseFeedForward(nn.Module):
-    """ A two-layer Feed-Forward-Network with residual layer norm.
+    """A two-layer Feed-Forward-Network with residual layer norm.

    Args:
        d_model (int): the size of input for the first-layer of the FFN.
@@ -770,8 +802,7 @@ class Translator(object):
        self.max_length = args.max_length

    def translate(self, batch, step, attn_debug=False):
-        """ Generates summaries from one batch of data.
-        """
+        """Generates summaries from one batch of data."""
        self.model.eval()
        with torch.no_grad():
            batch_data = self.translate_batch(batch)
@@ -798,8 +829,7 @@ class Translator(object):
    # Where the beam search lives
    # I have no idea why it is being called from the method above
    def _fast_translate_batch(self, batch, max_length, min_length=0):
-        """ Beam Search using the encoder inputs contained in `batch`.
-        """
+        """Beam Search using the encoder inputs contained in `batch`."""

        # The batch object is funny
        # Instead of just looking at the size of the arguments we encapsulate
@@ -981,7 +1011,7 @@ def tile(x, count, dim=0):


 class BertSumOptimizer(object):
-    """ Specific optimizer for BertSum.
+    """Specific optimizer for BertSum.

    As described in [1], the authors fine-tune BertSum for abstractive
    summarization using two Adam Optimizers with different warm-up steps and
@@ -999,10 +1029,16 @@ class BertSumOptimizer(object):

        self.optimizers = {
            "encoder": torch.optim.Adam(
-                model.encoder.parameters(), lr=lr["encoder"], betas=(beta_1, beta_2), eps=eps,
+                model.encoder.parameters(),
+                lr=lr["encoder"],
+                betas=(beta_1, beta_2),
+                eps=eps,
            ),
            "decoder": torch.optim.Adam(
-                model.decoder.parameters(), lr=lr["decoder"], betas=(beta_1, beta_2), eps=eps,
+                model.decoder.parameters(),
+                lr=lr["decoder"],
+                betas=(beta_1, beta_2),
+                eps=eps,
            ),
        }

--- a/examples/seq2seq/bertabs/run_summarization.py
+++ b/examples/seq2seq/bertabs/run_summarization.py
@@ -99,7 +99,7 @@ def evaluate(args):


 def save_summaries(summaries, path, original_document_name):
-    """ Write the summaries in fies that are prefixed by the original
+    """Write the summaries in fies that are prefixed by the original
    files' name with the `_summary` appended.

    Attributes:
@@ -125,7 +125,7 @@ def save_summaries(summaries, path, original_document_name):


 def format_summary(translation):
-    """ Transforms the output of the `from_batch` function
+    """Transforms the output of the `from_batch` function
    into nicely formatted summaries.
    """
    raw_summary, _, _ = translation
@@ -190,7 +190,12 @@ def build_data_iterator(args, tokenizer):
    def collate_fn(data):
        return collate(data, tokenizer, block_size=512, device=args.device)

-    iterator = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,)
+    iterator = DataLoader(
+        dataset,
+        sampler=sampler,
+        batch_size=args.batch_size,
+        collate_fn=collate_fn,
+    )

    return iterator

@@ -201,7 +206,7 @@ def load_and_cache_examples(args, tokenizer):


 def collate(data, tokenizer, block_size, device):
-    """ Collate formats the data passed to the data loader.
+    """Collate formats the data passed to the data loader.

    In particular we tokenize the data batch after batch to avoid keeping them
    all in memory. We output the data as a namedtuple to fit the original BertAbs's
@@ -231,7 +236,7 @@ def collate(data, tokenizer, block_size, device):


 def decode_summary(summary_tokens, tokenizer):
-    """ Decode the summary and return it in a format
+    """Decode the summary and return it in a format
    suitable for evaluation.
    """
    summary_tokens = summary_tokens.to("cpu").numpy()
@@ -242,8 +247,7 @@ def decode_summary(summary_tokens, tokenizer):


 def main():
-    """ The main function defines the interface with the users.
-    """
+    """The main function defines the interface with the users."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--documents_dir",
@@ -268,23 +272,41 @@ def main():
    )
    # EVALUATION options
    parser.add_argument(
-        "--no_cuda", default=False, type=bool, help="Whether to force the execution on CPU.",
+        "--no_cuda",
+        default=False,
+        type=bool,
+        help="Whether to force the execution on CPU.",
    )
    parser.add_argument(
-        "--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
+        "--batch_size",
+        default=4,
+        type=int,
+        help="Batch size per GPU/CPU for training.",
    )
    # BEAM SEARCH arguments
    parser.add_argument(
-        "--min_length", default=50, type=int, help="Minimum number of tokens for the summaries.",
+        "--min_length",
+        default=50,
+        type=int,
+        help="Minimum number of tokens for the summaries.",
    )
    parser.add_argument(
-        "--max_length", default=200, type=int, help="Maixmum number of tokens for the summaries.",
+        "--max_length",
+        default=200,
+        type=int,
+        help="Maixmum number of tokens for the summaries.",
    )
    parser.add_argument(
-        "--beam_size", default=5, type=int, help="The number of beams to start with for each example.",
+        "--beam_size",
+        default=5,
+        type=int,
+        help="The number of beams to start with for each example.",
    )
    parser.add_argument(
-        "--alpha", default=0.95, type=float, help="The value of alpha for the length penalty in the beam search.",
+        "--alpha",
+        default=0.95,
+        type=float,
+        help="The value of alpha for the length penalty in the beam search.",
    )
    parser.add_argument(
        "--block_trigram",
--- a/examples/seq2seq/bertabs/test_utils_summarization.py
+++ b/examples/seq2seq/bertabs/test_utils_summarization.py
@@ -43,8 +43,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
        self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)

    def test_process_story_no_highlights(self):
-        """ Processing a story with no highlights returns an empty list for the summary.
-        """
+        """Processing a story with no highlights returns an empty list for the summary."""
        raw_story = """It was the year of Our Lord one thousand seven hundred and
        seventy-five.\n\nSpiritual revelations were conceded to England at that
        favoured period, as at this."""
@@ -52,8 +51,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
        self.assertEqual(summary_lines, [])

    def test_process_empty_story(self):
-        """ An empty story returns an empty collection of lines.
-        """
+        """An empty story returns an empty collection of lines."""
        raw_story = ""
        story_lines, summary_lines = process_story(raw_story)
        self.assertEqual(story_lines, [])
--- a/examples/seq2seq/bertabs/utils_summarization.py
+++ b/examples/seq2seq/bertabs/utils_summarization.py
@@ -11,7 +11,7 @@ from torch.utils.data import Dataset


 class CNNDMDataset(Dataset):
-    """ Abstracts the dataset used to train seq2seq models.
+    """Abstracts the dataset used to train seq2seq models.

    The class will process the documents that are located in the specified
    folder. The preprocessing will work on any document that is reasonably
@@ -31,7 +31,7 @@ class CNNDMDataset(Dataset):
    """

    def __init__(self, path="", prefix="train"):
-        """ We initialize the class by listing all the documents to summarize.
+        """We initialize the class by listing all the documents to summarize.
        Files are not read in memory due to the size of some datasets (like CNN/DailyMail).
        """
        assert os.path.isdir(path)
@@ -60,7 +60,7 @@ class CNNDMDataset(Dataset):


 def process_story(raw_story):
-    """ Extract the story and summary from a story file.
+    """Extract the story and summary from a story file.

    Arguments:
        raw_story (str): content of the story file as an utf-8 encoded string.
@@ -108,7 +108,7 @@ def _add_missing_period(line):


 def truncate_or_pad(sequence, block_size, pad_token_id):
-    """ Adapt the source and target sequences' lengths to the block size.
+    """Adapt the source and target sequences' lengths to the block size.
    If the sequence is shorter we append padding token to the right of the sequence.
    """
    if len(sequence) > block_size:
@@ -119,8 +119,8 @@ def truncate_or_pad(sequence, block_size, pad_token_id):


 def build_mask(sequence, pad_token_id):
-    """ Builds the mask. The attention mechanism will only attend to positions
-    with value 1. """
+    """Builds the mask. The attention mechanism will only attend to positions
+    with value 1."""
    mask = torch.ones_like(sequence)
    idx_pad_tokens = sequence == pad_token_id
    mask[idx_pad_tokens] = 0
@@ -128,7 +128,7 @@ def build_mask(sequence, pad_token_id):


 def encode_for_summarization(story_lines, summary_lines, tokenizer):
-    """ Encode the story and summary lines, and join them
+    """Encode the story and summary lines, and join them
    as specified in [1] by using `[SEP] [CLS]` tokens to separate
    sentences.
    """
@@ -141,7 +141,7 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer):


 def compute_token_type_ids(batch, separator_token_id):
-    """ Segment embeddings as described in [1]
+    """Segment embeddings as described in [1]

    The values {0,1} were found in the repository [2].

--- a/examples/seq2seq/callbacks.py
+++ b/examples/seq2seq/callbacks.py
@@ -97,4 +97,9 @@ def get_checkpoint_callback(output_dir, metric):


 def get_early_stopping_callback(metric, patience):
-    return EarlyStopping(monitor=f"val_{metric}", mode="max", patience=patience, verbose=True,)
+    return EarlyStopping(
+        monitor=f"val_{metric}",
+        mode="max",
+        patience=patience,
+        verbose=True,
+    )
--- a/examples/seq2seq/distillation.py
+++ b/examples/seq2seq/distillation.py
@@ -348,7 +348,10 @@ class T5SummarizationDistiller(BartSummarizationDistiller):
        if self.different_encoder:
            with torch.no_grad():
                teacher_enc_outputs, teacher_enc_hid = self.teacher.encoder(
-                    source_ids, attention_mask=source_mask, output_hidden_states=True, use_cache=False,
+                    source_ids,
+                    attention_mask=source_mask,
+                    output_hidden_states=True,
+                    use_cache=False,
                )
            if self.hparams.alpha_encoder_loss > 0:
                loss_encoder = self.calc_mse_loss(enc_outputs, teacher_enc_outputs, source_mask)
--- a/examples/seq2seq/test_seq2seq_examples.py
+++ b/examples/seq2seq/test_seq2seq_examples.py
@@ -117,7 +117,12 @@ class TestSummarizationDistiller(unittest.TestCase):

    @require_multigpu
    def test_multigpu(self):
-        updates = dict(no_teacher=True, freeze_encoder=True, gpus=2, sortish_sampler=False,)
+        updates = dict(
+            no_teacher=True,
+            freeze_encoder=True,
+            gpus=2,
+            sortish_sampler=False,
+        )
        self._test_distiller_cli(updates)

    def test_distill_no_teacher(self):
@@ -261,7 +266,8 @@ def test_run_eval_bart(model):


@pytest.mark.parametrize(
-    ["model"], [pytest.param(T5_TINY), pytest.param(BART_TINY), pytest.param(MBART_TINY), pytest.param(MARIAN_TINY)],
+    ["model"],
+    [pytest.param(T5_TINY), pytest.param(BART_TINY), pytest.param(MBART_TINY), pytest.param(MARIAN_TINY)],
 )
 def test_finetune(model):
    args_d: dict = CHEAP_ARGS.copy()
@@ -329,7 +335,8 @@ def test_finetune_extra_model_args():
    output_dir = tempfile.mkdtemp(prefix="output_1_")
    args_d1 = args_d.copy()
    args_d1.update(
-        model_name_or_path=model, output_dir=output_dir,
+        model_name_or_path=model,
+        output_dir=output_dir,
    )
    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
    for p in extra_model_params:
@@ -344,7 +351,8 @@ def test_finetune_extra_model_args():
    output_dir = tempfile.mkdtemp(prefix="output_2_")
    args_d2 = args_d.copy()
    args_d2.update(
-        model_name_or_path=model, output_dir=output_dir,
+        model_name_or_path=model,
+        output_dir=output_dir,
    )
    unsupported_param = "encoder_layerdrop"
    args_d2[unsupported_param] = 0.5
@@ -478,7 +486,11 @@ def test_summarization_dataset_truncation(tok):
    max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
    trunc_target = 4
    train_dataset = Seq2SeqDataset(
-        tokenizer, data_dir=tmp_dir, type_path="train", max_source_length=20, max_target_length=trunc_target,
+        tokenizer,
+        data_dir=tmp_dir,
+        type_path="train",
+        max_source_length=20,
+        max_target_length=trunc_target,
    )
    dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
    for batch in dataloader:
--- a/examples/seq2seq/utils.py
+++ b/examples/seq2seq/utils.py
@@ -63,7 +63,9 @@ def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict:


 def trim_batch(
-    input_ids, pad_token_id, attention_mask=None,
+    input_ids,
+    pad_token_id,
+    attention_mask=None,
 ):
    """Remove columns that are populated exclusively by pad_token_id"""
    keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)
--- a/examples/text-classification/run_pl_glue.py
+++ b/examples/text-classification/run_pl_glue.py
@@ -153,7 +153,11 @@ class GLUETransformer(BaseTransformer):
        )

        parser.add_argument(
-            "--task", default="", type=str, required=True, help="The GLUE task to run",
+            "--task",
+            default="",
+            type=str,
+            required=True,
+            help="The GLUE task to run",
        )
        parser.add_argument(
            "--gpus",
@@ -177,7 +181,10 @@ def main():

    # If output_dir not provided, a folder will be generated in pwd
    if args.output_dir is None:
-        args.output_dir = os.path.join("./results", f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",)
+        args.output_dir = os.path.join(
+            "./results",
+            f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",
+        )
        os.makedirs(args.output_dir)

    model = GLUETransformer(args)
--- a/examples/text-classification/run_xnli.py
+++ b/examples/text-classification/run_xnli.py
@@ -328,7 +328,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
            processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )
        features = convert_examples_to_features(
-            examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=output_mode,
+            examples,
+            tokenizer,
+            max_length=args.max_seq_length,
+            label_list=label_list,
+            output_mode=output_mode,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
--- a/examples/text-generation/pplm/run_pplm.py
+++ b/examples/text-generation/pplm/run_pplm.py
@@ -698,7 +698,9 @@ def run_pplm_example(
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += "{}{}{}".format(
-                            colorama.Fore.RED, tokenizer.decode([word_id]), colorama.Style.RESET_ALL,
+                            colorama.Fore.RED,
+                            tokenizer.decode([word_id]),
+                            colorama.Style.RESET_ALL,
                        )
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
@@ -729,7 +731,10 @@ if __name__ == "__main__":
    parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on")
    parser.add_argument("--uncond", action="store_true", help="Generate from end-of-text as prefix")
    parser.add_argument(
-        "--num_samples", type=int, default=1, help="Number of samples to generate from the modified latents",
+        "--num_samples",
+        type=int,
+        default=1,
+        help="Number of samples to generate from the modified latents",
    )
    parser.add_argument(
        "--bag_of_words",
@@ -751,13 +756,22 @@ if __name__ == "__main__":
        help="Discriminator to use",
    )
    parser.add_argument(
-        "--discrim_weights", type=str, default=None, help="Weights for the generic discriminator",
+        "--discrim_weights",
+        type=str,
+        default=None,
+        help="Weights for the generic discriminator",
    )
    parser.add_argument(
-        "--discrim_meta", type=str, default=None, help="Meta information for the generic discriminator",
+        "--discrim_meta",
+        type=str,
+        default=None,
+        help="Meta information for the generic discriminator",
    )
    parser.add_argument(
-        "--class_label", type=int, default=-1, help="Class label used for the discriminator",
+        "--class_label",
+        type=int,
+        default=-1,
+        help="Class label used for the discriminator",
    )
    parser.add_argument("--length", type=int, default=100)
    parser.add_argument("--stepsize", type=float, default=0.02)
@@ -773,7 +787,10 @@ if __name__ == "__main__":
        help="Length of past which is being optimized; 0 corresponds to infinite window length",
    )
    parser.add_argument(
-        "--horizon_length", type=int, default=1, help="Length of future to optimize over",
+        "--horizon_length",
+        type=int,
+        default=1,
+        help="Length of future to optimize over",
    )
    parser.add_argument("--decay", action="store_true", help="whether to decay or not")
    parser.add_argument("--gamma", type=float, default=1.5)
@@ -783,7 +800,10 @@ if __name__ == "__main__":
    parser.add_argument("--no_cuda", action="store_true", help="no cuda")
    parser.add_argument("--colorama", action="store_true", help="colors keywords")
    parser.add_argument(
-        "--repetition_penalty", type=float, default=1.0, help="Penalize repetition. More than 1.0 -> less repetition",
+        "--repetition_penalty",
+        type=float,
+        default=1.0,
+        help="Penalize repetition. More than 1.0 -> less repetition",
    )

    args = parser.parse_args()
--- a/examples/text-generation/pplm/run_pplm_discrim_train.py
+++ b/examples/text-generation/pplm/run_pplm_discrim_train.py
@@ -242,7 +242,12 @@ def train_discriminator(

        text = torchtext_data.Field()
        label = torchtext_data.Field(sequential=False)
-        train_data, val_data, test_data = datasets.SST.splits(text, label, fine_grained=True, train_subtrees=True,)
+        train_data, val_data, test_data = datasets.SST.splits(
+            text,
+            label,
+            fine_grained=True,
+            train_subtrees=True,
+        )

        x = []
        y = []
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -41,7 +41,9 @@ from transformers import (


 logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
 )
 logger = logging.getLogger(__name__)

@@ -197,7 +199,10 @@ def main():
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()

    logger.warning(
-        "device: %s, n_gpu: %s, 16-bits training: %s", args.device, args.n_gpu, args.fp16,
+        "device: %s, n_gpu: %s, 16-bits training: %s",
+        args.device,
+        args.n_gpu,
+        args.fp16,
    )

    set_seed(args)
--- a/examples/token-classification/utils_ner.py
+++ b/examples/token-classification/utils_ner.py
@@ -90,11 +90,11 @@ class TokenClassificationTask:
        sequence_a_segment_id=0,
        mask_padding_with_zero=True,
    ) -> List[InputFeatures]:
-        """ Loads a data file into a list of `InputFeatures`
-            `cls_token_at_end` define the location of the CLS token:
-                - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
-                - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
-            `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
+        """Loads a data file into a list of `InputFeatures`
+        `cls_token_at_end` define the location of the CLS token:
+            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
+            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
+        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
        """
        # TODO clean up all this to leverage built-in features of tokenizers

@@ -230,7 +230,8 @@ if is_torch_available():
        ):
            # Load data features from cache or dataset file
            cached_features_file = os.path.join(
-                data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
+                data_dir,
+                "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
            )

            # Make sure only the first process in distributed training processes the dataset,