diff --git a/examples/legacy/question-answering/run_squad.py b/examples/legacy/question-answering/run_squad.py
index 84986eff6f..fd50bf06b7 100644
--- a/examples/legacy/question-answering/run_squad.py
+++ b/examples/legacy/question-answering/run_squad.py
@@ -74,7 +74,7 @@ def to_list(tensor):
 
 
 def train(args, train_dataset, model, tokenizer):
-    """ Train the model """
+    """Train the model"""
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
 
diff --git a/examples/legacy/run_openai_gpt.py b/examples/legacy/run_openai_gpt.py
index 72314b5edb..1c0c189420 100755
--- a/examples/legacy/run_openai_gpt.py
+++ b/examples/legacy/run_openai_gpt.py
@@ -61,7 +61,7 @@ def accuracy(out, labels):
 
 
 def load_rocstories_dataset(dataset_path):
-    """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
+    """Output a list of tuples(story, 1st continuation, 2nd continuation, label)"""
     with open(dataset_path, encoding="utf_8") as f:
         f = csv.reader(f)
         output = []
@@ -184,7 +184,7 @@ def main():
 
     # Load and encode the datasets
     def tokenize_and_encode(obj):
-        """ Tokenize and encode a nested object """
+        """Tokenize and encode a nested object"""
         if isinstance(obj, str):
             return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
         elif isinstance(obj, int):
diff --git a/examples/legacy/run_swag.py b/examples/legacy/run_swag.py
index ddce4d20e2..666c1becb3 100755
--- a/examples/legacy/run_swag.py
+++ b/examples/legacy/run_swag.py
@@ -276,7 +276,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
 
 
 def train(args, train_dataset, model, tokenizer):
-    """ Train the model """
+    """Train the model"""
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
 
diff --git a/examples/legacy/seq2seq/minify_dataset.py b/examples/legacy/seq2seq/minify_dataset.py
index 8fd03196a0..e6095cecc8 100755
--- a/examples/legacy/seq2seq/minify_dataset.py
+++ b/examples/legacy/seq2seq/minify_dataset.py
@@ -19,7 +19,7 @@ import fire
 
 
 def minify(src_dir: str, dest_dir: str, n: int):
-    """Write first n lines of each file f in src_dir to dest_dir/f """
+    """Write first n lines of each file f in src_dir to dest_dir/f"""
     src_dir = Path(src_dir)
     dest_dir = Path(dest_dir)
     dest_dir.mkdir(exist_ok=True)
diff --git a/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py b/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
index 1ac84f28d3..0366366d71 100755
--- a/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
+++ b/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
@@ -71,7 +71,7 @@ def set_seed(args):
 
 
 def train(args, train_dataset, model, tokenizer):
-    """ Train the model """
+    """Train the model"""
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
 
diff --git a/examples/research_projects/bertabs/modeling_bertabs.py b/examples/research_projects/bertabs/modeling_bertabs.py
index ce0e25e2b1..a7d8611a26 100644
--- a/examples/research_projects/bertabs/modeling_bertabs.py
+++ b/examples/research_projects/bertabs/modeling_bertabs.py
@@ -251,7 +251,7 @@ class TransformerDecoder(nn.Module):
         return output, state  # , state
 
     def init_decoder_state(self, src, memory_bank, with_cache=False):
-        """ Init decoder state """
+        """Init decoder state"""
         state = TransformerDecoderState(src)
         if with_cache:
             state._init_cache(memory_bank, self.num_layers)
@@ -479,11 +479,11 @@ class MultiHeadedAttention(nn.Module):
         head_count = self.head_count
 
         def shape(x):
-            """  projection """
+            """projection"""
             return x.view(batch_size, -1, head_count, dim_per_head).transpose(1, 2)
 
         def unshape(x):
-            """  compute context """
+            """compute context"""
             return x.transpose(1, 2).contiguous().view(batch_size, -1, head_count * dim_per_head)
 
         # 1) Project key, value, and query.
@@ -571,12 +571,12 @@ class DecoderState(object):
     """
 
     def detach(self):
-        """ Need to document this """
+        """Need to document this"""
         self.hidden = tuple([_.detach() for _ in self.hidden])
         self.input_feed = self.input_feed.detach()
 
     def beam_update(self, idx, positions, beam_size):
-        """ Need to document this """
+        """Need to document this"""
         for e in self._all:
             sizes = e.size()
             br = sizes[1]
@@ -592,7 +592,7 @@ class DecoderState(object):
 
 
 class TransformerDecoderState(DecoderState):
-    """ Transformer Decoder state base class """
+    """Transformer Decoder state base class"""
 
     def __init__(self, src):
         """
@@ -638,7 +638,7 @@ class TransformerDecoderState(DecoderState):
             self.cache["layer_{}".format(l)] = layer_cache
 
     def repeat_beam_size_times(self, beam_size):
-        """ Repeat beam_size times along batch dimension. """
+        """Repeat beam_size times along batch dimension."""
         self.src = self.src.data.repeat(1, beam_size, 1)
 
     def map_batch_fn(self, fn):
diff --git a/examples/research_projects/bertabs/test_utils_summarization.py b/examples/research_projects/bertabs/test_utils_summarization.py
index 5af0898246..18120c9063 100644
--- a/examples/research_projects/bertabs/test_utils_summarization.py
+++ b/examples/research_projects/bertabs/test_utils_summarization.py
@@ -25,19 +25,19 @@ class SummarizationDataProcessingTest(unittest.TestCase):
         self.block_size = 10
 
     def test_fit_to_block_sequence_too_small(self):
-        """ Pad the sequence with 0 if the sequence is smaller than the block size."""
+        """Pad the sequence with 0 if the sequence is smaller than the block size."""
         sequence = [1, 2, 3, 4]
         expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
         self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
 
     def test_fit_to_block_sequence_fit_exactly(self):
-        """ Do nothing if the sequence is the right size. """
+        """Do nothing if the sequence is the right size."""
         sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
         expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
         self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
 
     def test_fit_to_block_sequence_too_big(self):
-        """ Truncate the sequence if it is too long. """
+        """Truncate the sequence if it is too long."""
         sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
         expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
         self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
diff --git a/examples/research_projects/bertabs/utils_summarization.py b/examples/research_projects/bertabs/utils_summarization.py
index 11ce599429..716365336b 100644
--- a/examples/research_projects/bertabs/utils_summarization.py
+++ b/examples/research_projects/bertabs/utils_summarization.py
@@ -47,7 +47,7 @@ class CNNDMDataset(Dataset):
             self.documents.append(path_to_story)
 
     def __len__(self):
-        """ Returns the number of documents. """
+        """Returns the number of documents."""
         return len(self.documents)
 
     def __getitem__(self, idx):
diff --git a/examples/research_projects/bertology/run_bertology.py b/examples/research_projects/bertology/run_bertology.py
index d0eef30430..fb1c24e5bc 100644
--- a/examples/research_projects/bertology/run_bertology.py
+++ b/examples/research_projects/bertology/run_bertology.py
@@ -49,14 +49,14 @@ logger = logging.getLogger(__name__)
 
 
 def entropy(p):
-    """ Compute the entropy of a probability distribution """
+    """Compute the entropy of a probability distribution"""
     plogp = p * torch.log(p)
     plogp[p == 0] = 0
     return -plogp.sum(dim=-1)
 
 
 def print_2d_tensor(tensor):
-    """ Print a 2D tensor """
+    """Print a 2D tensor"""
     logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor))))
     for row in range(len(tensor)):
         if tensor.dtype != torch.long:
diff --git a/examples/research_projects/bertology/run_prune_gpt.py b/examples/research_projects/bertology/run_prune_gpt.py
index 7e88f3081e..5dbabe3912 100644
--- a/examples/research_projects/bertology/run_prune_gpt.py
+++ b/examples/research_projects/bertology/run_prune_gpt.py
@@ -36,7 +36,7 @@ def save_model(model, dirpath):
 
 
 def entropy(p, unlogit=False):
-    """ Compute the entropy of a probability distribution """
+    """Compute the entropy of a probability distribution"""
     exponent = 2
     if unlogit:
         p = torch.pow(p, exponent)
@@ -46,7 +46,7 @@ def entropy(p, unlogit=False):
 
 
 def print_2d_tensor(tensor):
-    """ Print a 2D tensor """
+    """Print a 2D tensor"""
     logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor))))
     for row in range(len(tensor)):
         if tensor.dtype != torch.long:
diff --git a/examples/research_projects/deebert/run_glue_deebert.py b/examples/research_projects/deebert/run_glue_deebert.py
index 7e415d0939..97ae17faab 100644
--- a/examples/research_projects/deebert/run_glue_deebert.py
+++ b/examples/research_projects/deebert/run_glue_deebert.py
@@ -70,7 +70,7 @@ def get_wanted_result(result):
 
 
 def train(args, train_dataset, model, tokenizer, train_highway=False):
-    """ Train the model """
+    """Train the model"""
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
 
diff --git a/examples/research_projects/distillation/run_squad_w_distillation.py b/examples/research_projects/distillation/run_squad_w_distillation.py
index 3429bf1cbe..1c7256fccf 100644
--- a/examples/research_projects/distillation/run_squad_w_distillation.py
+++ b/examples/research_projects/distillation/run_squad_w_distillation.py
@@ -92,7 +92,7 @@ def to_list(tensor):
 
 
 def train(args, train_dataset, model, tokenizer, teacher=None):
-    """ Train the model """
+    """Train the model"""
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
 
diff --git a/examples/research_projects/mm-imdb/run_mmimdb.py b/examples/research_projects/mm-imdb/run_mmimdb.py
index d948a5a62d..4157d2e9cf 100644
--- a/examples/research_projects/mm-imdb/run_mmimdb.py
+++ b/examples/research_projects/mm-imdb/run_mmimdb.py
@@ -64,7 +64,7 @@ def set_seed(args):
 
 
 def train(args, train_dataset, model, tokenizer, criterion):
-    """ Train the model """
+    """Train the model"""
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
 
diff --git a/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py b/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py
index c686d39e34..0f4803cdd5 100644
--- a/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py
+++ b/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py
@@ -393,7 +393,7 @@ class MaskedBertPreTrainedModel(PreTrainedModel):
     base_model_prefix = "bert"
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, (nn.Linear, nn.Embedding)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/examples/research_projects/movement-pruning/masked_run_glue.py b/examples/research_projects/movement-pruning/masked_run_glue.py
index 0657aa24ce..48605ee053 100644
--- a/examples/research_projects/movement-pruning/masked_run_glue.py
+++ b/examples/research_projects/movement-pruning/masked_run_glue.py
@@ -105,7 +105,7 @@ def regularization(model: nn.Module, mode: str):
 
 
 def train(args, train_dataset, model, tokenizer, teacher=None):
-    """ Train the model """
+    """Train the model"""
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter(log_dir=args.output_dir)
 
diff --git a/examples/research_projects/movement-pruning/masked_run_squad.py b/examples/research_projects/movement-pruning/masked_run_squad.py
index 9fd219c089..56f26eff10 100644
--- a/examples/research_projects/movement-pruning/masked_run_squad.py
+++ b/examples/research_projects/movement-pruning/masked_run_squad.py
@@ -113,7 +113,7 @@ def to_list(tensor):
 
 
 def train(args, train_dataset, model, tokenizer, teacher=None):
-    """ Train the model """
+    """Train the model"""
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter(log_dir=args.output_dir)
 
diff --git a/src/transformers/commands/lfs.py b/src/transformers/commands/lfs.py
index 42b00f0d2f..9d8f90502f 100644
--- a/src/transformers/commands/lfs.py
+++ b/src/transformers/commands/lfs.py
@@ -96,7 +96,7 @@ def write_msg(msg: Dict):
 
 
 def read_msg() -> Optional[Dict]:
-    """Read Line delimited JSON from stdin. """
+    """Read Line delimited JSON from stdin."""
     msg = json.loads(sys.stdin.readline().strip())
 
     if "terminate" in (msg.get("type"), msg.get("event")):
diff --git a/src/transformers/data/processors/utils.py b/src/transformers/data/processors/utils.py
index 06db91f7e2..a5a04266a0 100644
--- a/src/transformers/data/processors/utils.py
+++ b/src/transformers/data/processors/utils.py
@@ -124,7 +124,7 @@ class DataProcessor:
 
 
 class SingleSentenceClassificationProcessor(DataProcessor):
-    """ Generic processor for a single sentence classification data set."""
+    """Generic processor for a single sentence classification data set."""
 
     def __init__(self, labels=None, examples=None, mode="classification", verbose=False):
         self.labels = [] if labels is None else labels
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 392728fdf0..d9ee0378cc 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -1462,7 +1462,7 @@ def tf_required(func):
 
 
 def is_tensor(x):
-    """ Tests if ``x`` is a :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`. """
+    """Tests if ``x`` is a :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`."""
     if is_torch_available():
         import torch
 
@@ -1684,7 +1684,7 @@ class _BaseLazyModule(ModuleType):
 
 
 def copy_func(f):
-    """ Returns a copy of a function f."""
+    """Returns a copy of a function f."""
     # Based on http://stackoverflow.com/a/6528148/190597 (Glenn Maynard)
     g = types.FunctionType(f.__code__, f.__globals__, name=f.__name__, argdefs=f.__defaults__, closure=f.__closure__)
     g = functools.update_wrapper(g, f)
diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py
index 38316de881..97fdf1903a 100644
--- a/src/transformers/modelcard.py
+++ b/src/transformers/modelcard.py
@@ -215,6 +215,6 @@ class ModelCard:
         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
 
     def to_json_file(self, json_file_path):
-        """ Save this instance to a json file."""
+        """Save this instance to a json file."""
         with open(json_file_path, "w", encoding="utf-8") as writer:
             writer.write(self.to_json_string())
diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py
index 21da03fd7a..a753c58088 100755
--- a/src/transformers/models/albert/modeling_albert.py
+++ b/src/transformers/models/albert/modeling_albert.py
@@ -71,7 +71,7 @@ ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 
 def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
-    """ Load tf checkpoints in a pytorch model."""
+    """Load tf checkpoints in a pytorch model."""
     try:
         import re
 
diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py
index 64be5062c8..c750705ee6 100644
--- a/src/transformers/models/albert/modeling_tf_albert.py
+++ b/src/transformers/models/albert/modeling_tf_albert.py
@@ -189,7 +189,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
 
 
 class TFAlbertAttention(tf.keras.layers.Layer):
-    """ Contains the complete attention sublayer, including both dropouts and layer norm. """
+    """Contains the complete attention sublayer, including both dropouts and layer norm."""
 
     def __init__(self, config: AlbertConfig, **kwargs):
         super().__init__(**kwargs)
diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py
index 92c06bbcde..493a5e145a 100644
--- a/src/transformers/models/albert/tokenization_albert.py
+++ b/src/transformers/models/albert/tokenization_albert.py
@@ -187,7 +187,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
         return outputs
 
     def _tokenize(self, text, sample=False):
-        """ Tokenize a string. """
+        """Tokenize a string."""
         text = self.preprocess_text(text)
 
         if not sample:
@@ -211,7 +211,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
         return new_pieces
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         return self.sp_model.PieceToId(token)
 
     def _convert_id_to_token(self, index):
diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py
index 641cc80c1d..95d64cfa28 100644
--- a/src/transformers/models/barthez/tokenization_barthez.py
+++ b/src/transformers/models/barthez/tokenization_barthez.py
@@ -223,7 +223,7 @@ class BarthezTokenizer(PreTrainedTokenizer):
         return self.sp_model.EncodeAsPieces(text)
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         spm_id = self.sp_model.PieceToId(token)
diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py
index a1176f3a4a..34dd5329be 100755
--- a/src/transformers/models/bert/modeling_bert.py
+++ b/src/transformers/models/bert/modeling_bert.py
@@ -703,7 +703,7 @@ class BertPreTrainedModel(PreTrainedModel):
     _keys_to_ignore_on_load_missing = [r"position_ids"]
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, nn.Linear):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py
index fbb2cfc029..897fb32761 100644
--- a/src/transformers/models/bert/tokenization_bert.py
+++ b/src/transformers/models/bert/tokenization_bert.py
@@ -233,7 +233,7 @@ class BertTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         return self.vocab.get(token, self.vocab.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
@@ -241,7 +241,7 @@ class BertTokenizer(PreTrainedTokenizer):
         return self.ids_to_tokens.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) in a single string."""
         out_string = " ".join(tokens).replace(" ##", "").strip()
         return out_string
 
diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py
index 6f366c7f42..dad2d1cece 100755
--- a/src/transformers/models/bert_generation/modeling_bert_generation.py
+++ b/src/transformers/models/bert_generation/modeling_bert_generation.py
@@ -177,7 +177,7 @@ class BertGenerationPreTrainedModel(PreTrainedModel):
     _keys_to_ignore_on_load_missing = [r"position_ids"]
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, nn.Linear):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py
index 42b5fcac8e..795d5f504c 100644
--- a/src/transformers/models/bert_generation/tokenization_bert_generation.py
+++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py
@@ -119,7 +119,7 @@ class BertGenerationTokenizer(PreTrainedTokenizer):
         return pieces
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         return self.sp_model.piece_to_id(token)
 
     def _convert_id_to_token(self, index):
@@ -128,7 +128,7 @@ class BertGenerationTokenizer(PreTrainedTokenizer):
         return token
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) in a single string."""
         out_string = self.sp_model.decode_pieces(tokens)
         return out_string
 
diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py
index bf110274da..b2fcfb5318 100644
--- a/src/transformers/models/bertweet/tokenization_bertweet.py
+++ b/src/transformers/models/bertweet/tokenization_bertweet.py
@@ -368,7 +368,7 @@ class BertweetTokenizer(PreTrainedTokenizer):
             return token
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
@@ -376,7 +376,7 @@ class BertweetTokenizer(PreTrainedTokenizer):
         return self.decoder.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) in a single string."""
         out_string = " ".join(tokens).replace("@@ ", "").strip()
         return out_string
 
diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py
index e745ca9cee..cdef9ed00b 100755
--- a/src/transformers/models/big_bird/modeling_big_bird.py
+++ b/src/transformers/models/big_bird/modeling_big_bird.py
@@ -484,7 +484,7 @@ class BigBirdBlockSparseAttention(nn.Module):
 
     @staticmethod
     def torch_bmm_nd(inp_1, inp_2, ndim=None):
-        """ Fast nd matrix multiplication """
+        """Fast nd matrix multiplication"""
         # faster replacement of torch.einsum ("bhqk,bhkd->bhqd")
         return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
             inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 1])
@@ -492,7 +492,7 @@ class BigBirdBlockSparseAttention(nn.Module):
 
     @staticmethod
     def torch_bmm_nd_transpose(inp_1, inp_2, ndim=None):
-        """ Fast nd matrix multiplication with transpose """
+        """Fast nd matrix multiplication with transpose"""
         # faster replacement of torch.einsum (bhqd,bhkd->bhqk)
         return torch.bmm(
             inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
@@ -1743,7 +1743,7 @@ class BigBirdPreTrainedModel(PreTrainedModel):
     _keys_to_ignore_on_load_missing = [r"position_ids"]
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, nn.Linear):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py
index a2c3dd023a..e3e5a93f6d 100644
--- a/src/transformers/models/big_bird/tokenization_big_bird.py
+++ b/src/transformers/models/big_bird/tokenization_big_bird.py
@@ -149,7 +149,7 @@ class BigBirdTokenizer(PreTrainedTokenizer):
         return pieces
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         return self.sp_model.piece_to_id(token)
 
     def _convert_id_to_token(self, index):
@@ -158,7 +158,7 @@ class BigBirdTokenizer(PreTrainedTokenizer):
         return token
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) in a single string."""
         out_string = self.sp_model.decode_pieces(tokens)
         return out_string
 
diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
index 1af143f380..1b8104e924 100644
--- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
@@ -183,7 +183,7 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
         return " ".join(words)
 
     def _tokenize(self, text: str) -> List[str]:
-        """ Split a string into tokens using BPE."""
+        """Split a string into tokens using BPE."""
         split_tokens = []
 
         words = re.findall(r"\S+\n?", text)
@@ -193,7 +193,7 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token: str) -> int:
-        """ Converts a token to an id using the vocab. """
+        """Converts a token to an id using the vocab."""
         token = token.lower()
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
@@ -202,7 +202,7 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
         return self.decoder.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        """ Converts a sequence of tokens  in a single string. """
+        """Converts a sequence of tokens  in a single string."""
         out_string = " ".join(tokens).replace("@@ ", "").strip()
         return out_string
 
diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py
index 8337d6826c..b7bee4e19c 100644
--- a/src/transformers/models/camembert/tokenization_camembert.py
+++ b/src/transformers/models/camembert/tokenization_camembert.py
@@ -222,7 +222,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
         return self.sp_model.EncodeAsPieces(text)
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         elif self.sp_model.PieceToId(token) == 0:
diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py
index 0ededdc83f..f597ff1789 100755
--- a/src/transformers/models/convbert/modeling_convbert.py
+++ b/src/transformers/models/convbert/modeling_convbert.py
@@ -238,7 +238,7 @@ class ConvBertPreTrainedModel(PreTrainedModel):
     authorized_unexpected_keys = [r"convbert\.embeddings_project\.weight", r"convbert\.embeddings_project\.bias"]
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, nn.Linear):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/ctrl/tokenization_ctrl.py b/src/transformers/models/ctrl/tokenization_ctrl.py
index d1adb50087..31ac0637a9 100644
--- a/src/transformers/models/ctrl/tokenization_ctrl.py
+++ b/src/transformers/models/ctrl/tokenization_ctrl.py
@@ -212,7 +212,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
@@ -220,7 +220,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
         return self.decoder.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) in a single string."""
         out_string = " ".join(tokens).replace("@@ ", "").strip()
         return out_string
 
diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
index 78509f88d7..cd0c4eaa59 100644
--- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
@@ -134,7 +134,7 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
         return self._tokenizer.tokenize(text)
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         return self._tokenizer.spm.PieceToId(token)
 
     def _convert_id_to_token(self, index):
@@ -142,7 +142,7 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
         return self._tokenizer.spm.IdToPiece(index) if index < self.vocab_size else self.unk_token
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) in a single string."""
         return self._tokenizer.decode(tokens)
 
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py
index 8844d7f656..602d5e2600 100644
--- a/src/transformers/models/deit/modeling_deit.py
+++ b/src/transformers/models/deit/modeling_deit.py
@@ -386,7 +386,7 @@ class DeiTPreTrainedModel(PreTrainedModel):
     base_model_prefix = "deit"
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, (nn.Linear, nn.Conv2d)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py
index 911fba8088..ca4b42987b 100755
--- a/src/transformers/models/distilbert/modeling_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_distilbert.py
@@ -167,11 +167,11 @@ class MultiHeadSelfAttention(nn.Module):
         mask_reshp = (bs, 1, 1, k_length)
 
         def shape(x):
-            """ separate heads """
+            """separate heads"""
             return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
 
         def unshape(x):
-            """ group heads """
+            """group heads"""
             return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
 
         q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py b/src/transformers/models/distilbert/modeling_tf_distilbert.py
index 8ec0060ab3..2eddbffc14 100644
--- a/src/transformers/models/distilbert/modeling_tf_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py
@@ -175,11 +175,11 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
         mask_reshape = [bs, 1, 1, k_length]
 
         def shape(x):
-            """ separate heads """
+            """separate heads"""
             return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
 
         def unshape(x):
-            """ group heads """
+            """group heads"""
             return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
 
         q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py
index 8f77289fe5..006a22f4c7 100644
--- a/src/transformers/models/electra/modeling_electra.py
+++ b/src/transformers/models/electra/modeling_electra.py
@@ -653,7 +653,7 @@ class ElectraPreTrainedModel(PreTrainedModel):
 
     # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, nn.Linear):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/flaubert/modeling_tf_flaubert.py b/src/transformers/models/flaubert/modeling_tf_flaubert.py
index da2f2d21c7..c6f43a4ced 100644
--- a/src/transformers/models/flaubert/modeling_tf_flaubert.py
+++ b/src/transformers/models/flaubert/modeling_tf_flaubert.py
@@ -342,11 +342,11 @@ class TFFlaubertMultiHeadAttention(tf.keras.layers.Layer):
         mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen)
 
         def shape(x):
-            """  projection """
+            """projection"""
             return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
 
         def unshape(x):
-            """  compute context """
+            """compute context"""
             return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
 
         q = shape(self.q_lin(input))  # (bs, n_heads, qlen, dim_per_head)
diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py
index 226d18cc3e..ff99d75eeb 100644
--- a/src/transformers/models/fsmt/tokenization_fsmt.py
+++ b/src/transformers/models/fsmt/tokenization_fsmt.py
@@ -374,7 +374,7 @@ class FSMTTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) to an id using the vocab."""
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
@@ -382,7 +382,7 @@ class FSMTTokenizer(PreTrainedTokenizer):
         return self.decoder.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) into a single string."""
 
         # remove BPE
         tokens = [t.replace(" ", "").replace("</w>", " ") for t in tokens]
diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py
index 1f277498d1..8e99fb67ae 100644
--- a/src/transformers/models/funnel/modeling_funnel.py
+++ b/src/transformers/models/funnel/modeling_funnel.py
@@ -188,7 +188,7 @@ class FunnelAttentionStructure(nn.Module):
         self.pooling_mult = None
 
     def init_attention_inputs(self, inputs_embeds, attention_mask=None, token_type_ids=None):
-        """ Returns the attention inputs associated to the inputs of the model. """
+        """Returns the attention inputs associated to the inputs of the model."""
         # inputs_embeds has shape batch_size x seq_len x d_model
         # attention_mask and token_type_ids have shape batch_size x seq_len
         self.pooling_mult = 1
@@ -383,7 +383,7 @@ class FunnelAttentionStructure(nn.Module):
         return tensor
 
     def pre_attention_pooling(self, output, attention_inputs):
-        """ Pool `output` and the proper parts of `attention_inputs` before the attention layer. """
+        """Pool `output` and the proper parts of `attention_inputs` before the attention layer."""
         position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
         if self.config.pool_q_only:
             if self.config.attention_type == "factorized":
@@ -403,7 +403,7 @@ class FunnelAttentionStructure(nn.Module):
         return output, attention_inputs
 
     def post_attention_pooling(self, attention_inputs):
-        """ Pool the proper parts of `attention_inputs` after the attention layer. """
+        """Pool the proper parts of `attention_inputs` after the attention layer."""
         position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
         if self.config.pool_q_only:
             self.pooling_mult *= 2
@@ -457,7 +457,7 @@ class FunnelRelMultiheadAttention(nn.Module):
         self.scale = 1.0 / (d_head ** 0.5)
 
     def relative_positional_attention(self, position_embeds, q_head, context_len, cls_mask=None):
-        """ Relative attention score for the positional encodings """
+        """Relative attention score for the positional encodings"""
         # q_head has shape batch_size x sea_len x n_head x d_head
         if self.config.attention_type == "factorized":
             # Notations from the paper, appending A.2.2, final formula (https://arxiv.org/abs/2006.03236)
@@ -499,7 +499,7 @@ class FunnelRelMultiheadAttention(nn.Module):
         return positional_attn
 
     def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None):
-        """ Relative attention score for the token_type_ids """
+        """Relative attention score for the token_type_ids"""
         if token_type_mat is None:
             return 0
         batch_size, seq_len, context_len = token_type_mat.shape
diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py
index b4e53eafdf..5834bb57d8 100644
--- a/src/transformers/models/funnel/modeling_tf_funnel.py
+++ b/src/transformers/models/funnel/modeling_tf_funnel.py
@@ -139,7 +139,7 @@ class TFFunnelAttentionStructure:
         self.pooling_mult = None
 
     def init_attention_inputs(self, inputs_embeds, attention_mask=None, token_type_ids=None, training=False):
-        """ Returns the attention inputs associated to the inputs of the model. """
+        """Returns the attention inputs associated to the inputs of the model."""
         # inputs_embeds has shape batch_size x seq_len x d_model
         # attention_mask and token_type_ids have shape batch_size x seq_len
         self.pooling_mult = 1
@@ -328,7 +328,7 @@ class TFFunnelAttentionStructure:
         return tf.squeeze(tensor, 2) if ndim == 2 else tensor
 
     def pre_attention_pooling(self, output, attention_inputs):
-        """ Pool `output` and the proper parts of `attention_inputs` before the attention layer. """
+        """Pool `output` and the proper parts of `attention_inputs` before the attention layer."""
         position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
         if self.pool_q_only:
             if self.attention_type == "factorized":
@@ -348,7 +348,7 @@ class TFFunnelAttentionStructure:
         return output, attention_inputs
 
     def post_attention_pooling(self, attention_inputs):
-        """ Pool the proper parts of `attention_inputs` after the attention layer. """
+        """Pool the proper parts of `attention_inputs` after the attention layer."""
         position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
         if self.pool_q_only:
             self.pooling_mult *= 2
@@ -424,7 +424,7 @@ class TFFunnelRelMultiheadAttention(tf.keras.layers.Layer):
         super().build(input_shape)
 
     def relative_positional_attention(self, position_embeds, q_head, context_len, cls_mask=None):
-        """ Relative attention score for the positional encodings """
+        """Relative attention score for the positional encodings"""
         # q_head has shape batch_size x sea_len x n_head x d_head
         if self.attention_type == "factorized":
             # Notations from the paper, appending A.2.2, final formula (https://arxiv.org/abs/2006.03236)
@@ -470,7 +470,7 @@ class TFFunnelRelMultiheadAttention(tf.keras.layers.Layer):
         return positional_attn
 
     def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None):
-        """ Relative attention score for the token_type_ids """
+        """Relative attention score for the token_type_ids"""
         if token_type_mat is None:
             return 0
         batch_size, seq_len, context_len = shape_list(token_type_mat)
@@ -723,7 +723,7 @@ class TFFunnelDecoder(tf.keras.layers.Layer):
 
 @keras_serializable
 class TFFunnelBaseLayer(tf.keras.layers.Layer):
-    """ Base model without decoder """
+    """Base model without decoder"""
 
     config_class = FunnelConfig
 
@@ -807,7 +807,7 @@ class TFFunnelBaseLayer(tf.keras.layers.Layer):
 
 @keras_serializable
 class TFFunnelMainLayer(tf.keras.layers.Layer):
-    """ Base model with decoder """
+    """Base model with decoder"""
 
     config_class = FunnelConfig
 
diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py
index e27ad9d3c0..8bf15c8ac6 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2.py
@@ -242,7 +242,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         return word
 
     def _tokenize(self, text):
-        """ Tokenize a string. """
+        """Tokenize a string."""
         bpe_tokens = []
         for token in re.findall(self.pat, text):
             token = "".join(
@@ -252,7 +252,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         return bpe_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) to an id using the vocab."""
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
@@ -260,7 +260,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         return self.decoder.get(index)
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) into a single string."""
         text = "".join(tokens)
         text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
         return text
diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py
index 382577a9f0..3c72c2a17e 100644
--- a/src/transformers/models/ibert/modeling_ibert.py
+++ b/src/transformers/models/ibert/modeling_ibert.py
@@ -645,7 +645,7 @@ class IBertPreTrainedModel(PreTrainedModel):
     base_model_prefix = "ibert"
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, (QuantLinear, nn.Linear)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py
index bce2ddd275..c8c3955579 100644
--- a/src/transformers/models/layoutlm/modeling_layoutlm.py
+++ b/src/transformers/models/layoutlm/modeling_layoutlm.py
@@ -611,7 +611,7 @@ class LayoutLMPreTrainedModel(PreTrainedModel):
     _keys_to_ignore_on_load_missing = [r"position_ids"]
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, nn.Linear):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py
index b245c3250b..89c436458b 100755
--- a/src/transformers/models/led/modeling_led.py
+++ b/src/transformers/models/led/modeling_led.py
@@ -509,7 +509,7 @@ class LEDEncoderSelfAttention(nn.Module):
 
     @staticmethod
     def _get_global_attn_indices(is_index_global_attn):
-        """ compute global attn indices required throughout forward pass """
+        """compute global attn indices required throughout forward pass"""
         # helper variable
         num_global_attn_indices = is_index_global_attn.long().sum(dim=1)
 
diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py
index 8197a8ad80..23f27f490a 100644
--- a/src/transformers/models/led/modeling_tf_led.py
+++ b/src/transformers/models/led/modeling_tf_led.py
@@ -670,7 +670,7 @@ class TFLEDEncoderSelfAttention(tf.keras.layers.Layer):
 
     @staticmethod
     def _get_global_attn_indices(is_index_global_attn):
-        """ compute global attn indices required throughout forward pass """
+        """compute global attn indices required throughout forward pass"""
         # helper variable
         num_global_attn_indices = tf.math.count_nonzero(is_index_global_attn, axis=1)
         num_global_attn_indices = tf.cast(num_global_attn_indices, dtype=tf.constant(1).dtype)
diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py
index 65634ca314..df95b3b869 100755
--- a/src/transformers/models/longformer/modeling_longformer.py
+++ b/src/transformers/models/longformer/modeling_longformer.py
@@ -899,7 +899,7 @@ class LongformerSelfAttention(nn.Module):
 
     @staticmethod
     def _get_global_attn_indices(is_index_global_attn):
-        """ compute global attn indices required throughout forward pass """
+        """compute global attn indices required throughout forward pass"""
         # helper variable
         num_global_attn_indices = is_index_global_attn.long().sum(dim=1)
 
@@ -1363,7 +1363,7 @@ class LongformerPreTrainedModel(PreTrainedModel):
     _keys_to_ignore_on_load_missing = [r"position_ids"]
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, nn.Linear):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py
index 6d5f769283..c9d66e408f 100644
--- a/src/transformers/models/longformer/modeling_tf_longformer.py
+++ b/src/transformers/models/longformer/modeling_tf_longformer.py
@@ -1189,7 +1189,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
 
     @staticmethod
     def _get_global_attn_indices(is_index_global_attn):
-        """ compute global attn indices required throughout forward pass """
+        """compute global attn indices required throughout forward pass"""
         # helper variable
         num_global_attn_indices = tf.math.count_nonzero(is_index_global_attn, axis=1)
         num_global_attn_indices = tf.cast(num_global_attn_indices, dtype=tf.constant(1).dtype)
diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py
index 7610d5c0c5..cc7c22fe9b 100644
--- a/src/transformers/models/lxmert/modeling_lxmert.py
+++ b/src/transformers/models/lxmert/modeling_lxmert.py
@@ -783,7 +783,7 @@ class LxmertPreTrainedModel(PreTrainedModel):
     base_model_prefix = "lxmert"
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, nn.Linear):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py
index 613b385b77..13453f0b58 100644
--- a/src/transformers/models/marian/tokenization_marian.py
+++ b/src/transformers/models/marian/tokenization_marian.py
@@ -227,7 +227,7 @@ class MarianTokenizer(PreTrainedTokenizer):
         return super().decode(token_ids, **kwargs)
 
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        """Uses source spm if _decode_use_source_tokenizer is True, and target spm otherwise """
+        """Uses source spm if _decode_use_source_tokenizer is True, and target spm otherwise"""
         if self._decode_use_source_tokenizer:
             return self.spm_source.DecodePieces(tokens)
         else:
diff --git a/src/transformers/models/mbart/tokenization_mbart50.py b/src/transformers/models/mbart/tokenization_mbart50.py
index 48fdfe7772..ef7ec88f24 100644
--- a/src/transformers/models/mbart/tokenization_mbart50.py
+++ b/src/transformers/models/mbart/tokenization_mbart50.py
@@ -189,7 +189,7 @@ class MBart50Tokenizer(PreTrainedTokenizer):
         return self.sp_model.EncodeAsPieces(text)
 
     def _convert_token_to_id(self, token: str) -> int:
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) to an id using the vocab."""
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         spm_id = self.sp_model.PieceToId(token)
diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py
index ce4ece3d32..49969c06b8 100755
--- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py
+++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py
@@ -708,7 +708,7 @@ class MegatronBertPreTrainedModel(PreTrainedModel):
     _keys_to_ignore_on_load_missing = [r"position_ids"]
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, (nn.Linear, nn.Embedding)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py
index bd3f86d21e..74fc6326b1 100644
--- a/src/transformers/models/mobilebert/modeling_mobilebert.py
+++ b/src/transformers/models/mobilebert/modeling_mobilebert.py
@@ -669,7 +669,7 @@ class MobileBertPreTrainedModel(PreTrainedModel):
     _keys_to_ignore_on_load_missing = [r"position_ids"]
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, nn.Linear):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py
index e64d4de30b..f1327a8719 100644
--- a/src/transformers/models/mpnet/modeling_mpnet.py
+++ b/src/transformers/models/mpnet/modeling_mpnet.py
@@ -56,7 +56,7 @@ class MPNetPreTrainedModel(PreTrainedModel):
     base_model_prefix = "mpnet"
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, nn.Linear):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py
index b9362bd625..8a2dcf36f2 100644
--- a/src/transformers/models/mpnet/modeling_tf_mpnet.py
+++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py
@@ -430,7 +430,7 @@ class TFMPNetEncoder(tf.keras.layers.Layer):
         return ret
 
     def compute_position_bias(self, x, position_ids=None):
-        """ Compute binned relative position bias """
+        """Compute binned relative position bias"""
         input_shape = shape_list(x)
         qlen, klen = input_shape[1], input_shape[1]
 
diff --git a/src/transformers/models/mpnet/tokenization_mpnet.py b/src/transformers/models/mpnet/tokenization_mpnet.py
index 8041ec4ec5..98af763ade 100644
--- a/src/transformers/models/mpnet/tokenization_mpnet.py
+++ b/src/transformers/models/mpnet/tokenization_mpnet.py
@@ -210,7 +210,7 @@ class MPNetTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) to an id using the vocab."""
         return self.vocab.get(token, self.vocab.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
@@ -218,7 +218,7 @@ class MPNetTokenizer(PreTrainedTokenizer):
         return self.ids_to_tokens.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) into a single string."""
         out_string = " ".join(tokens).replace(" ##", "").strip()
         return out_string
 
diff --git a/src/transformers/models/openai/tokenization_openai.py b/src/transformers/models/openai/tokenization_openai.py
index 92d4286c60..e5bc6b245f 100644
--- a/src/transformers/models/openai/tokenization_openai.py
+++ b/src/transformers/models/openai/tokenization_openai.py
@@ -176,7 +176,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         return word
 
     def _tokenize(self, text):
-        """ Tokenize a string. """
+        """Tokenize a string."""
         split_tokens = []
         if self.fix_text is None:
             # Using BERT's BasicTokenizer
@@ -191,7 +191,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) to an id using the vocab."""
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
@@ -199,7 +199,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         return self.decoder.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) into a single string."""
         out_string = "".join(tokens).replace("</w>", " ").strip()
         return out_string
 
diff --git a/src/transformers/models/pegasus/tokenization_pegasus.py b/src/transformers/models/pegasus/tokenization_pegasus.py
index 472ca424bb..7ced567254 100644
--- a/src/transformers/models/pegasus/tokenization_pegasus.py
+++ b/src/transformers/models/pegasus/tokenization_pegasus.py
@@ -175,7 +175,7 @@ class PegasusTokenizer(PreTrainedTokenizer):
         return pieces
 
     def _convert_token_to_id(self, token: str) -> int:
-        """ Converts a token (str) to an id using the vocab. """
+        """Converts a token (str) to an id using the vocab."""
         if token in self.decoder:
             return self.decoder[token]
         elif token in self.added_tokens_decoder:
@@ -194,7 +194,7 @@ class PegasusTokenizer(PreTrainedTokenizer):
         return token
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) into a single string."""
         out_string = self.sp_model.decode_pieces(tokens)
         return out_string
 
diff --git a/src/transformers/models/phobert/tokenization_phobert.py b/src/transformers/models/phobert/tokenization_phobert.py
index 3caca9012d..a07e5bba3a 100644
--- a/src/transformers/models/phobert/tokenization_phobert.py
+++ b/src/transformers/models/phobert/tokenization_phobert.py
@@ -295,7 +295,7 @@ class PhobertTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) to an id using the vocab."""
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
@@ -303,7 +303,7 @@ class PhobertTokenizer(PreTrainedTokenizer):
         return self.decoder.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) into a single string."""
         out_string = " ".join(tokens).replace("@@ ", "").strip()
         return out_string
 
diff --git a/src/transformers/models/prophetnet/tokenization_prophetnet.py b/src/transformers/models/prophetnet/tokenization_prophetnet.py
index 25df78162e..56f26df0e4 100644
--- a/src/transformers/models/prophetnet/tokenization_prophetnet.py
+++ b/src/transformers/models/prophetnet/tokenization_prophetnet.py
@@ -172,7 +172,7 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) to an id using the vocab."""
         return self.vocab.get(token, self.vocab.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
@@ -180,7 +180,7 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
         return self.ids_to_tokens.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) into a single string."""
         out_string = " ".join(tokens).replace(" ##", "").strip()
         return out_string
 
diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py
index 28f0fdd08e..3156c70827 100755
--- a/src/transformers/models/reformer/modeling_reformer.py
+++ b/src/transformers/models/reformer/modeling_reformer.py
@@ -1779,7 +1779,7 @@ class ReformerPreTrainedModel(PreTrainedModel):
         return dummy_inputs
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, AxialPositionEmbeddings):
             for weight in module.weights:
                 torch.nn.init.normal_(weight, std=self.config.axial_norm_std)
diff --git a/src/transformers/models/reformer/tokenization_reformer.py b/src/transformers/models/reformer/tokenization_reformer.py
index c933d0cbc7..535a93a31a 100644
--- a/src/transformers/models/reformer/tokenization_reformer.py
+++ b/src/transformers/models/reformer/tokenization_reformer.py
@@ -115,7 +115,7 @@ class ReformerTokenizer(PreTrainedTokenizer):
         return pieces
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) to an id using the vocab."""
         return self.sp_model.piece_to_id(token)
 
     def _convert_id_to_token(self, index):
@@ -125,7 +125,7 @@ class ReformerTokenizer(PreTrainedTokenizer):
         return token
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) into a single string."""
         out_string = self.sp_model.decode_pieces(tokens)
         return out_string
 
diff --git a/src/transformers/models/retribert/modeling_retribert.py b/src/transformers/models/retribert/modeling_retribert.py
index 0b6023e7bc..2507688209 100644
--- a/src/transformers/models/retribert/modeling_retribert.py
+++ b/src/transformers/models/retribert/modeling_retribert.py
@@ -50,7 +50,7 @@ class RetriBertPreTrainedModel(PreTrainedModel):
     base_model_prefix = "retribert"
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, nn.Linear):
             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py
index f7a73b336c..274833d050 100644
--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
@@ -574,7 +574,7 @@ class RobertaPreTrainedModel(PreTrainedModel):
 
     # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, nn.Linear):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py
index 09dcd680bb..ce7d18808d 100644
--- a/src/transformers/models/squeezebert/modeling_squeezebert.py
+++ b/src/transformers/models/squeezebert/modeling_squeezebert.py
@@ -431,7 +431,7 @@ class SqueezeBertPreTrainedModel(PreTrainedModel):
     _keys_to_ignore_on_load_missing = [r"position_ids"]
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, (nn.Linear, nn.Conv1d)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
index 013f291c5b..0474ef5ef6 100644
--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -389,7 +389,7 @@ class T5Attention(nn.Module):
         return relative_buckets
 
     def compute_bias(self, query_length, key_length):
-        """ Compute binned relative position bias """
+        """Compute binned relative position bias"""
         context_position = torch.arange(query_length, dtype=torch.long)[:, None]
         memory_position = torch.arange(key_length, dtype=torch.long)[None, :]
         relative_position = memory_position - context_position  # shape (query_length, key_length)
@@ -436,15 +436,15 @@ class T5Attention(nn.Module):
         key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]
 
         def shape(states):
-            """  projection """
+            """projection"""
             return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
 
         def unshape(states):
-            """  reshape """
+            """reshape"""
             return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
 
         def project(hidden_states, proj_layer, key_value_states, past_key_value):
-            """ projects hidden states correctly to key/query states """
+            """projects hidden states correctly to key/query states"""
             if key_value_states is None:
                 # self-attn
                 # (batch_size, n_heads, seq_length, dim_per_head)
@@ -718,7 +718,7 @@ class T5PreTrainedModel(PreTrainedModel):
         return dummy_inputs
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         factor = self.config.initializer_factor  # Used for testing weights initialization
         if isinstance(module, T5LayerNorm):
             module.weight.data.fill_(factor * 1.0)
diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py
index d1a9f0aaca..99be003d07 100644
--- a/src/transformers/models/t5/modeling_tf_t5.py
+++ b/src/transformers/models/t5/modeling_tf_t5.py
@@ -80,7 +80,7 @@ class TFT5LayerNorm(tf.keras.layers.Layer):
         self.variance_epsilon = epsilon
 
     def build(self, input_shape):
-        """Build shared word embedding layer """
+        """Build the layer norm weight, initialized to ones"""
         self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones")
         super().build(input_shape)
 
@@ -230,7 +230,7 @@ class TFT5Attention(tf.keras.layers.Layer):
         return relative_buckets
 
     def compute_bias(self, query_length, key_length):
-        """ Compute binned relative position bias """
+        """Compute binned relative position bias"""
         context_position = tf.range(query_length)[:, None]
         memory_position = tf.range(key_length)[None, :]
         relative_position = memory_position - context_position  # shape (query_length, key_length)
@@ -279,17 +279,17 @@ class TFT5Attention(tf.keras.layers.Layer):
         key_length = real_seq_length if key_value_states is None else shape_list(key_value_states)[1]
 
         def shape(hidden_states):
-            """  projection """
+            """projection"""
             return tf.transpose(
                 tf.reshape(hidden_states, (batch_size, -1, self.n_heads, self.key_value_proj_dim)), perm=(0, 2, 1, 3)
             )
 
         def unshape(hidden_states):
-            """  compute context """
+            """compute context"""
             return tf.reshape(tf.transpose(hidden_states, perm=(0, 2, 1, 3)), (batch_size, -1, self.inner_dim))
 
         def project(hidden_states, proj_layer, key_value_states, past_key_value):
-            """ projects hidden states correctly to key/query states """
+            """projects hidden states correctly to key/query states"""
             if key_value_states is None:
                 # self-attn
                 # (batch_size, n_heads, seq_length, dim_per_head)
diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py
index 90a0159aef..a069cf4488 100644
--- a/src/transformers/models/t5/tokenization_t5.py
+++ b/src/transformers/models/t5/tokenization_t5.py
@@ -243,7 +243,7 @@ class T5Tokenizer(PreTrainedTokenizer):
         return pieces
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         if token.startswith("<extra_id_"):
             match = re.match(r"<extra_id_(\d+)>", token)
             num = int(match.group(1))
@@ -259,7 +259,7 @@ class T5Tokenizer(PreTrainedTokenizer):
         return token
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) in a single string."""
         current_sub_tokens = []
         out_string = ""
         for token in tokens:
diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py
index 5bfca58596..39c97ccaa1 100644
--- a/src/transformers/models/tapas/modeling_tapas.py
+++ b/src/transformers/models/tapas/modeling_tapas.py
@@ -699,7 +699,7 @@ class TapasPreTrainedModel(PreTrainedModel):
 
     # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, nn.Linear):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py
index 3d1e82ac51..b509c4a4c4 100644
--- a/src/transformers/models/tapas/tokenization_tapas.py
+++ b/src/transformers/models/tapas/tokenization_tapas.py
@@ -374,7 +374,7 @@ class TapasTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         return self.vocab.get(token, self.vocab.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
@@ -382,7 +382,7 @@ class TapasTokenizer(PreTrainedTokenizer):
         return self.ids_to_tokens.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) in a single string."""
         out_string = " ".join(tokens).replace(" ##", "").strip()
         return out_string
 
diff --git a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
index 9b185ecdd1..fb7e70ee21 100644
--- a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
+++ b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
@@ -434,7 +434,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         return self.idx2sym[idx]
 
     def _convert_token_to_id(self, sym):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         if sym in self.sym2idx:
             return self.sym2idx[sym]
         else:
diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py
index 559dfff83c..3584813db6 100644
--- a/src/transformers/models/vit/modeling_vit.py
+++ b/src/transformers/models/vit/modeling_vit.py
@@ -372,7 +372,7 @@ class ViTPreTrainedModel(PreTrainedModel):
     base_model_prefix = "vit"
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, (nn.Linear, nn.Conv2d)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
index ba548dc3d8..4c9d56254c 100755
--- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -680,7 +680,7 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel):
     _keys_to_ignore_on_load_missing = [r"position_ids"]
 
     def _init_weights(self, module):
-        """ Initialize the weights """
+        """Initialize the weights"""
         if isinstance(module, nn.Linear):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
diff --git a/src/transformers/models/xlm/modeling_tf_xlm.py b/src/transformers/models/xlm/modeling_tf_xlm.py
index 6bac6f597c..0ae3ac2a24 100644
--- a/src/transformers/models/xlm/modeling_tf_xlm.py
+++ b/src/transformers/models/xlm/modeling_tf_xlm.py
@@ -151,11 +151,11 @@ class TFXLMMultiHeadAttention(tf.keras.layers.Layer):
         mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen)
 
         def shape(x):
-            """  projection """
+            """projection"""
             return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
 
         def unshape(x):
-            """  compute context """
+            """compute context"""
             return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
 
         q = shape(self.q_lin(input))  # (bs, n_heads, qlen, dim_per_head)
diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py
index a4a6c0dd08..55ff28b86e 100755
--- a/src/transformers/models/xlm/modeling_xlm.py
+++ b/src/transformers/models/xlm/modeling_xlm.py
@@ -159,11 +159,11 @@ class MultiHeadAttention(nn.Module):
         mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)
 
         def shape(x):
-            """  projection """
+            """projection"""
             return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
 
         def unshape(x):
-            """  compute context """
+            """compute context"""
             return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
 
         q = shape(self.q_lin(input))  # (bs, n_heads, qlen, dim_per_head)
@@ -251,7 +251,7 @@ class XLMPreTrainedModel(PreTrainedModel):
         return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list}
 
     def _init_weights(self, module):
-        """ Initialize the weights. """
+        """Initialize the weights."""
         if isinstance(module, nn.Embedding):
             if self.config is not None and self.config.embed_init_std is not None:
                 nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std)
diff --git a/src/transformers/models/xlm/tokenization_xlm.py b/src/transformers/models/xlm/tokenization_xlm.py
index 95730451fd..6c102fdbbe 100644
--- a/src/transformers/models/xlm/tokenization_xlm.py
+++ b/src/transformers/models/xlm/tokenization_xlm.py
@@ -847,7 +847,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
@@ -855,7 +855,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         return self.decoder.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
+        """Converts a sequence of tokens (string) in a single string."""
         out_string = "".join(tokens).replace("</w>", " ").strip()
         return out_string
 
diff --git a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
index 188292ed17..9c2d90914a 100644
--- a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
+++ b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
@@ -245,7 +245,7 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
         return self.sp_model.EncodeAsPieces(text)
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         spm_id = self.sp_model.PieceToId(token)
diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
index 877bfaf1d1..cda78e900d 100644
--- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
@@ -270,7 +270,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         return self.sp_model.encode(text, out_type=str)
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         spm_id = self.sp_model.PieceToId(token)
diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py
index 97fd542c28..5137bcfee3 100644
--- a/src/transformers/models/xlnet/tokenization_xlnet.py
+++ b/src/transformers/models/xlnet/tokenization_xlnet.py
@@ -189,7 +189,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         return outputs
 
     def _tokenize(self, text, sample=False):
-        """ Tokenize a string. """
+        """Tokenize a string."""
         text = self.preprocess_text(text)
 
         if not sample:
@@ -213,7 +213,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         return new_pieces
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         return self.sp_model.PieceToId(token)
 
     def _convert_id_to_token(self, index):
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 2292acb662..4144be2eb9 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -389,7 +389,7 @@ if is_tf_available():
 
 
 def require_torch_gpu(test_case):
-    """Decorator marking a test that requires CUDA and PyTorch. """
+    """Decorator marking a test that requires CUDA and PyTorch."""
     if torch_device != "cuda":
         return unittest.skip("test requires CUDA")(test_case)
     else:
@@ -593,14 +593,14 @@ class CaptureStd:
 
 
 class CaptureStdout(CaptureStd):
-    """ Same as CaptureStd but captures only stdout """
+    """Same as CaptureStd but captures only stdout"""
 
     def __init__(self):
         super().__init__(err=False)
 
 
 class CaptureStderr(CaptureStd):
-    """ Same as CaptureStd but captures only stderr """
+    """Same as CaptureStd but captures only stderr"""
 
     def __init__(self):
         super().__init__(out=False)
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index e72b889767..fb69674081 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -88,7 +88,7 @@ else:
 
     @dataclass
     class EncodingFast:
-        """ This is dummy class because without the `tokenizers` library we don't have these objects anyway """
+        """This is a dummy class because without the `tokenizers` library we don't have these objects anyway"""
 
         pass
 
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index d53ad9ac44..e97d999df7 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -805,7 +805,7 @@ class Trainer:
         return len(dataloader.dataset)
 
     def _hp_search_setup(self, trial: Union["optuna.Trial", Dict[str, Any]]):
-        """ HP search setup code """
+        """HP search setup code"""
         self._trial = trial
 
         if self.hp_search_backend is None or trial is None:
diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py
index e760ab55c1..c699e33d46 100644
--- a/src/transformers/trainer_callback.py
+++ b/src/transformers/trainer_callback.py
@@ -92,14 +92,14 @@ class TrainerState:
             self.log_history = []
 
     def save_to_json(self, json_path: str):
-        """ Save the content of this instance in JSON format inside :obj:`json_path`."""
+        """Save the content of this instance in JSON format inside :obj:`json_path`."""
         json_string = json.dumps(dataclasses.asdict(self), indent=2, sort_keys=True) + "\n"
         with open(json_path, "w", encoding="utf-8") as f:
             f.write(json_string)
 
     @classmethod
     def load_from_json(cls, json_path: str):
-        """ Create an instance from the content of :obj:`json_path`."""
+        """Create an instance from the content of :obj:`json_path`."""
         with open(json_path, "r", encoding="utf-8") as f:
             text = f.read()
         return cls(**json.loads(text))
@@ -141,15 +141,15 @@ class TrainerControl:
     should_log: bool = False
 
     def _new_training(self):
-        """ Internal method that resets the variable for a new training. """
+        """Internal method that resets the variable for a new training."""
         self.should_training_stop = False
 
     def _new_epoch(self):
-        """ Internal method that resets the variable for a new epoch. """
+        """Internal method that resets the variable for a new epoch."""
         self.should_epoch_stop = False
 
     def _new_step(self):
-        """ Internal method that resets the variable for a new step. """
+        """Internal method that resets the variable for a new step."""
         self.should_save = False
         self.should_evaluate = False
         self.should_log = False
@@ -275,7 +275,7 @@ class TrainerCallback:
 
 
 class CallbackHandler(TrainerCallback):
-    """ Internal class that just calls the list of callbacks in order. """
+    """Internal class that just calls the list of callbacks in order."""
 
     def __init__(self, callbacks, model, tokenizer, optimizer, lr_scheduler):
         self.callbacks = []
diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py
index 0b58904c00..7ae962155c 100644
--- a/src/transformers/trainer_pt_utils.py
+++ b/src/transformers/trainer_pt_utils.py
@@ -294,14 +294,14 @@ def get_tpu_sampler(dataset: torch.utils.data.dataset.Dataset, bach_size: int):
 
 
 def nested_new_like(arrays, num_samples, padding_index=-100):
-    """ Create the same nested structure as `arrays` with a first dimension always at `num_samples`."""
+    """Create the same nested structure as `arrays` with a first dimension always at `num_samples`."""
     if isinstance(arrays, (list, tuple)):
         return type(arrays)(nested_new_like(x, num_samples) for x in arrays)
     return np.full_like(arrays, padding_index, shape=(num_samples, *arrays.shape[1:]))
 
 
 def expand_like(arrays, new_seq_length, padding_index=-100):
-    """ Expand the `arrays` so that the second dimension grows to `new_seq_length`. Uses `padding_index` for padding."""
+    """Expand the `arrays` so that the second dimension grows to `new_seq_length`. Uses `padding_index` for padding."""
     result = np.full_like(arrays, padding_index, shape=(arrays.shape[0], new_seq_length) + arrays.shape[2:])
     result[:, : arrays.shape[1]] = arrays
     return result
diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py
index 53d2cf7f15..7a2bfedf82 100644
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -320,7 +320,7 @@ class TrainerMemoryTracker:
         self.init_reported = False
 
     def derive_stage(self):
-        """ derives the stage/caller name automatically """
+        """derives the stage/caller name automatically"""
         caller = inspect.currentframe().f_back.f_back.f_code.co_name
         if caller in self.stages:
             return self.stages[caller]
@@ -330,7 +330,7 @@ class TrainerMemoryTracker:
             )
 
     def cpu_mem_used(self):
-        """ get resident set size memory for the current process """
+        """get resident set size memory for the current process"""
         return self.process.memory_info().rss
 
     def peak_monitor_func(self):
@@ -346,7 +346,7 @@ class TrainerMemoryTracker:
                 break
 
     def start(self):
-        """ start tracking for the caller's stage """
+        """start tracking for the caller's stage"""
         if self.skip_memory_metrics:
             return
 
@@ -376,7 +376,7 @@ class TrainerMemoryTracker:
         peak_monitor_thread.start()
 
     def stop(self, stage):
-        """ stop tracking for the passed stage """
+        """stop tracking for the passed stage"""
 
         # deal with nested calls of eval during train - simply ignore those
         if self.cur_stage is not None and self.cur_stage != stage:
@@ -416,7 +416,7 @@ class TrainerMemoryTracker:
         self.cur_stage = None
 
     def update_metrics(self, stage, metrics):
-        """ stop tracking for the passed stage """
+        """updates the metrics for the passed stage"""
         if self.skip_memory_metrics:
             return
 
@@ -438,7 +438,7 @@ class TrainerMemoryTracker:
                     metrics[f"{stage}_mem_gpu_{t}_delta"] = self.gpu[stage][t]
 
     def stop_and_update_metrics(self, metrics=None):
-        """ combine stop + update in one call for simpler code """
+        """combine stop + update in one call for simpler code"""
         if self.skip_memory_metrics:
             return
 
diff --git a/src/transformers/utils/versions.py b/src/transformers/utils/versions.py
index 73151487bc..36125d8681 100644
--- a/src/transformers/utils/versions.py
+++ b/src/transformers/utils/versions.py
@@ -115,12 +115,12 @@ def require_version(requirement: str, hint: Optional[str] = None) -> None:
 
 
 def require_version_core(requirement):
-    """ require_version wrapper which emits a core-specific hint on failure """
+    """require_version wrapper which emits a core-specific hint on failure"""
     hint = "Try: pip install transformers -U or pip install -e '.[dev]' if you're working with git master"
     return require_version(requirement, hint)
 
 
 def require_version_examples(requirement):
-    """ require_version wrapper which emits examples-specific hint on failure """
+    """require_version wrapper which emits examples-specific hint on failure"""
     hint = "Try: pip install -r examples/requirements.txt"
     return require_version(requirement, hint)
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index 9868966a5a..07afadc369 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -122,7 +122,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
         transformers.integrations._is_deepspeed_zero3_enabled = None
 
     def get_config_dict(self, stage):
-        """ As the tests modify the dict, always make a copy """
+        """As the tests modify the dict, always make a copy"""
         config = deepcopy(self.ds_config_dict[stage])
         if stage == ZERO3:
             # This setting slows things down, so don't enable it by default unless needed by a test.
@@ -430,7 +430,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
 @require_deepspeed
 @require_torch_gpu
 class TestDeepSpeedWithLauncher(TestCasePlus):
-    """ This class is for testing via an external script - can do multiple gpus """
+    """This class is for testing via an external script - can do multiple gpus"""
 
     # Tests to devise #
     #
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index b82a8c5664..d93faa1f6c 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -1122,7 +1122,7 @@ class ModelTesterMixin:
 
         # a candidate for testing_utils
         def get_current_gpu_memory_use():
-            """ returns a list of cuda memory allocations per GPU in MBs"""
+            """returns a list of cuda memory allocations per GPU in MBs"""
 
             per_device_memory = []
             for id in range(torch.cuda.device_count()):
diff --git a/tests/test_modeling_funnel.py b/tests/test_modeling_funnel.py
index 4435359eb6..2d59e9f4e4 100644
--- a/tests/test_modeling_funnel.py
+++ b/tests/test_modeling_funnel.py
@@ -42,7 +42,7 @@ if is_torch_available():
 
 
 class FunnelModelTester:
-    """You can also import this e.g, from .test_modeling_funnel import FunnelModelTester """
+    """You can also import this e.g, from .test_modeling_funnel import FunnelModelTester"""
 
     def __init__(
         self,
diff --git a/tests/test_modeling_layoutlm.py b/tests/test_modeling_layoutlm.py
index d26bf91cbd..a62d13e8fc 100644
--- a/tests/test_modeling_layoutlm.py
+++ b/tests/test_modeling_layoutlm.py
@@ -36,7 +36,7 @@ if is_torch_available():
 
 
 class LayoutLMModelTester:
-    """You can also import this e.g from .test_modeling_layoutlm import LayoutLMModelTester """
+    """You can also import this e.g from .test_modeling_layoutlm import LayoutLMModelTester"""
 
     def __init__(
         self,
diff --git a/tests/test_modeling_lxmert.py b/tests/test_modeling_lxmert.py
index b03cc31335..451db8089a 100644
--- a/tests/test_modeling_lxmert.py
+++ b/tests/test_modeling_lxmert.py
@@ -40,7 +40,7 @@ if is_torch_available():
 
 
 class LxmertModelTester:
-    """You can also import this e.g from .test_modeling_bart import BartModelTester """
+    """You can also import this e.g. from .test_modeling_lxmert import LxmertModelTester"""
 
     def __init__(
         self,
diff --git a/tests/test_modeling_tapas.py b/tests/test_modeling_tapas.py
index b36147d558..40bdba0e70 100644
--- a/tests/test_modeling_tapas.py
+++ b/tests/test_modeling_tapas.py
@@ -63,7 +63,7 @@ if is_torch_available():
 
 
 class TapasModelTester:
-    """You can also import this e.g from .test_modeling_tapas import TapasModelTester """
+    """You can also import this e.g from .test_modeling_tapas import TapasModelTester"""
 
     def __init__(
         self,
diff --git a/tests/test_modeling_tf_funnel.py b/tests/test_modeling_tf_funnel.py
index dc13ed725c..93a811f24b 100644
--- a/tests/test_modeling_tf_funnel.py
+++ b/tests/test_modeling_tf_funnel.py
@@ -39,7 +39,7 @@ if is_tf_available():
 
 
 class TFFunnelModelTester:
-    """You can also import this e.g, from .test_modeling_funnel import FunnelModelTester """
+    """You can also import this e.g., from .test_modeling_tf_funnel import TFFunnelModelTester"""
 
     def __init__(
         self,
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index febb9a05c0..ac596a42c0 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -58,7 +58,7 @@ NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilin
 
 
 def filter_non_english(_, pretrained_name: str):
-    """ Filter all the model for non-english language """
+    """Return True when `pretrained_name` contains none of the non-English language tags"""
     return not any([lang in pretrained_name for lang in NON_ENGLISH_TAGS])
 
 
diff --git a/tests/test_tokenization_fsmt.py b/tests/test_tokenization_fsmt.py
index 8675cc0ffb..276941f594 100644
--- a/tests/test_tokenization_fsmt.py
+++ b/tests/test_tokenization_fsmt.py
@@ -100,7 +100,7 @@ class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertEqual(tokenizer.tgt_vocab_size, 21)
 
     def test_full_tokenizer(self):
-        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
+        """Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt"""
         tokenizer = FSMTTokenizer(self.langs, self.src_vocab_file, self.tgt_vocab_file, self.merges_file)
 
         text = "lower"
diff --git a/tests/test_tokenization_layoutlm.py b/tests/test_tokenization_layoutlm.py
index 89aac1355c..79831cd30c 100644
--- a/tests/test_tokenization_layoutlm.py
+++ b/tests/test_tokenization_layoutlm.py
@@ -70,5 +70,5 @@ class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
     def test_special_tokens_as_you_expect(self):
-        """If you are training a seq2seq model that expects a decoder_prefix token make sure it is prepended to decoder_input_ids """
+        """If you are training a seq2seq model that expects a decoder_prefix token make sure it is prepended to decoder_input_ids"""
         pass
diff --git a/tests/test_tokenization_xlm.py b/tests/test_tokenization_xlm.py
index e39426e850..cf0296ddd9 100644
--- a/tests/test_tokenization_xlm.py
+++ b/tests/test_tokenization_xlm.py
@@ -72,7 +72,7 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         return input_text, output_text
 
     def test_full_tokenizer(self):
-        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
+        """Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt"""
         tokenizer = XLMTokenizer(self.vocab_file, self.merges_file)
 
         text = "lower"
diff --git a/utils/check_copies.py b/utils/check_copies.py
index 3d6ef7adbd..db1999d224 100644
--- a/utils/check_copies.py
+++ b/utils/check_copies.py
@@ -33,7 +33,7 @@ def _should_continue(line, indent):
 
 
 def find_code_in_transformers(object_name):
-    """ Find and return the code source code of `object_name`."""
+    """Find and return the source code of `object_name`."""
     parts = object_name.split(".")
     i = 0
 
@@ -193,7 +193,7 @@ def check_copies(overwrite: bool = False):
 
 
 def get_model_list():
-    """ Extracts the model list from the README. """
+    """Extracts the model list from the README."""
     # If the introduction or the conclusion of the list change, the prompts may need to be updated.
     _start_prompt = "🤗 Transformers currently provides the following architectures"
     _end_prompt = "1. Want to contribute a new model?"
@@ -224,7 +224,7 @@ def get_model_list():
 
 
 def split_long_line_with_indent(line, max_per_line, indent):
-    """ Split the `line` so that it doesn't go over `max_per_line` and adds `indent` to new lines. """
+    """Split the `line` so that it doesn't go over `max_per_line` and adds `indent` to new lines."""
     words = line.split(" ")
     lines = []
     current_line = words[0]
@@ -239,7 +239,7 @@ def split_long_line_with_indent(line, max_per_line, indent):
 
 
 def convert_to_rst(model_list, max_per_line=None):
-    """ Convert `model_list` to rst format. """
+    """Convert `model_list` to rst format."""
     # Convert **[description](link)** to `description <link>`__
     def _rep_link(match):
         title, link = match.groups()
@@ -298,7 +298,7 @@ def _find_text_in_file(filename, start_prompt, end_prompt):
 
 
 def check_model_list_copy(overwrite=False, max_per_line=119):
-    """ Check the model lists in the README and index.rst are consistent and maybe `overwrite`. """
+    """Check the model lists in the README and index.rst are consistent and maybe `overwrite`."""
     rst_list, start_index, end_index, lines = _find_text_in_file(
         filename=os.path.join(PATH_TO_DOCS, "index.rst"),
         start_prompt="    This list is updated automatically from the README",
diff --git a/utils/check_dummies.py b/utils/check_dummies.py
index 89965f9784..fb71ea1536 100644
--- a/utils/check_dummies.py
+++ b/utils/check_dummies.py
@@ -65,7 +65,7 @@ def find_backend(line):
 
 
 def read_init():
-    """ Read the init and extracts PyTorch, TensorFlow, SentencePiece and Tokenizers objects. """
+    """Read the init and extracts PyTorch, TensorFlow, SentencePiece and Tokenizers objects."""
     with open(os.path.join(PATH_TO_TRANSFORMERS, "__init__.py"), "r", encoding="utf-8", newline="\n") as f:
         lines = f.readlines()
 
@@ -101,7 +101,7 @@ def read_init():
 
 
 def create_dummy_object(name, backend_name):
-    """ Create the code for the dummy object corresponding to `name`."""
+    """Create the code for the dummy object corresponding to `name`."""
     _pretrained = [
         "Config" "ForCausalLM",
         "ForConditionalGeneration",
@@ -130,7 +130,7 @@ def create_dummy_object(name, backend_name):
 
 
 def create_dummy_files():
-    """ Create the content of the dummy files. """
+    """Create the content of the dummy files."""
     backend_specific_objects = read_init()
     # For special correspondence backend to module name as used in the function requires_modulename
     dummy_files = {}
@@ -146,7 +146,7 @@ def create_dummy_files():
 
 
 def check_dummies(overwrite=False):
-    """ Check if the dummy files are up to date and maybe `overwrite` with the right content. """
+    """Check if the dummy files are up to date and maybe `overwrite` with the right content."""
     dummy_files = create_dummy_files()
     # For special correspondence backend to shortcut as used in utils/dummy_xxx_objects.py
     short_names = {"torch": "pt"}
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 6f5fd8faf3..bd6c9af45b 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -119,7 +119,7 @@ transformers = spec.loader.load_module()
 # If some modeling modules should be ignored for all checks, they should be added in the nested list
 # _ignore_modules of this function.
 def get_model_modules():
-    """ Get the model modules inside the transformers library. """
+    """Get the model modules inside the transformers library."""
     _ignore_modules = [
         "modeling_auto",
         "modeling_encoder_decoder",
@@ -151,7 +151,7 @@ def get_model_modules():
 
 
 def get_models(module):
-    """ Get the objects in module that are models."""
+    """Get the objects in module that are models."""
     models = []
     model_classes = (transformers.PreTrainedModel, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel)
     for attr_name in dir(module):
@@ -166,7 +166,7 @@ def get_models(module):
 # If some test_modeling files should be ignored when checking models are all tested, they should be added in the
 # nested list _ignore_files of this function.
 def get_model_test_files():
-    """ Get the model test files."""
+    """Get the model test files."""
     _ignore_files = [
         "test_modeling_common",
         "test_modeling_encoder_decoder",
@@ -187,7 +187,7 @@ def get_model_test_files():
 # This is a bit hacky but I didn't find a way to import the test_file as a module and read inside the tester class
 # for the all_model_classes variable.
 def find_tested_models(test_file):
-    """ Parse the content of test_file to detect what's in all_model_classes"""
+    """Parse the content of test_file to detect what's in all_model_classes"""
     # This is a bit hacky but I didn't find a way to import the test_file as a module and read inside the class
     with open(os.path.join(PATH_TO_TESTS, test_file), "r", encoding="utf-8", newline="\n") as f:
         content = f.read()
@@ -205,7 +205,7 @@ def find_tested_models(test_file):
 
 
 def check_models_are_tested(module, test_file):
-    """ Check models defined in module are tested in test_file."""
+    """Check models defined in module are tested in test_file."""
     defined_models = get_models(module)
     tested_models = find_tested_models(test_file)
     if tested_models is None:
@@ -229,7 +229,7 @@ def check_models_are_tested(module, test_file):
 
 
 def check_all_models_are_tested():
-    """ Check all models are properly tested."""
+    """Check all models are properly tested."""
     modules = get_model_modules()
     test_files = get_model_test_files()
     failures = []
@@ -245,7 +245,7 @@ def check_all_models_are_tested():
 
 
 def get_all_auto_configured_models():
-    """ Return the list of all models in at least one auto class."""
+    """Return the list of all models in at least one auto class."""
     result = set()  # To avoid duplicates we concatenate all model classes in a set.
     for attr_name in dir(transformers.models.auto.modeling_auto):
         if attr_name.startswith("MODEL_") and attr_name.endswith("MAPPING"):
@@ -271,7 +271,7 @@ def ignore_unautoclassed(model_name):
 
 
 def check_models_are_auto_configured(module, all_auto_models):
-    """ Check models defined in module are each in an auto class."""
+    """Check models defined in module are each in an auto class."""
     defined_models = get_models(module)
     failures = []
     for model_name, _ in defined_models:
@@ -285,7 +285,7 @@ def check_models_are_auto_configured(module, all_auto_models):
 
 
 def check_all_models_are_auto_configured():
-    """ Check all models are each in an auto class."""
+    """Check all models are each in an auto class."""
     modules = get_model_modules()
     all_auto_models = get_all_auto_configured_models()
     failures = []
@@ -301,7 +301,7 @@ _re_decorator = re.compile(r"^\s*@(\S+)\s+$")
 
 
 def check_decorator_order(filename):
-    """ Check that in the test file `filename` the slow decorator is always last."""
+    """Check that in the test file `filename` the slow decorator is always last."""
     with open(filename, "r", encoding="utf-8", newline="\n") as f:
         lines = f.readlines()
     decorator_before = None
@@ -319,7 +319,7 @@ def check_decorator_order(filename):
 
 
 def check_all_decorator_order():
-    """ Check that in all test files, the slow decorator is always last."""
+    """Check that in all test files, the slow decorator is always last."""
     errors = []
     for fname in os.listdir(PATH_TO_TESTS):
         if fname.endswith(".py"):
@@ -334,7 +334,7 @@ def check_all_decorator_order():
 
 
 def find_all_documented_objects():
-    """ Parse the content of all doc files to detect which classes and functions it documents"""
+    """Parse the content of all doc files to detect which classes and functions it documents"""
     documented_obj = []
     for doc_file in Path(PATH_TO_DOC).glob("**/*.rst"):
         with open(doc_file, "r", encoding="utf-8", newline="\n") as f:
@@ -454,7 +454,7 @@ def ignore_undocumented(name):
 
 
 def check_all_objects_are_documented():
-    """ Check all models are properly documented."""
+    """Check all models are properly documented."""
     documented_objs = find_all_documented_objects()
     modules = transformers._modules
     objects = [c for c in dir(transformers) if c not in modules and not c.startswith("_")]
@@ -467,7 +467,7 @@ def check_all_objects_are_documented():
 
 
 def check_repo_quality():
-    """ Check all models are properly tested and documented."""
+    """Check all models are properly tested and documented."""
     print("Checking all models are properly tested.")
     check_all_decorator_order()
     check_all_models_are_tested()
diff --git a/utils/check_table.py b/utils/check_table.py
index b45daf46ed..9151040fc9 100644
--- a/utils/check_table.py
+++ b/utils/check_table.py
@@ -159,7 +159,7 @@ def get_model_table_from_auto_modules():
 
 
 def check_model_table(overwrite=False):
-    """ Check the model table in the index.rst is consistent with the state of the lib and maybe `overwrite`. """
+    """Check the model table in the index.rst is consistent with the state of the lib and maybe `overwrite`."""
     current_table, start_index, end_index, lines = _find_text_in_file(
         filename=os.path.join(PATH_TO_DOCS, "index.rst"),
         start_prompt="    This table is updated automatically from the auto module",
diff --git a/utils/style_doc.py b/utils/style_doc.py
index 4da4709912..82341a07c4 100644
--- a/utils/style_doc.py
+++ b/utils/style_doc.py
@@ -431,7 +431,7 @@ def _add_new_lines_before_doc_special_words(text):
 
 
 def style_rst_file(doc_file, max_len=119, check_only=False):
-    """ Style one rst file `doc_file` to `max_len`."""
+    """Style one rst file `doc_file` to `max_len`."""
     with open(doc_file, "r", encoding="utf-8", newline="\n") as f:
         doc = f.read()