From 969859d5f67c7106de4d1098c4891c9b03694bbe Mon Sep 17 00:00:00 2001
From: Santiago Castro <santi.1410@hotmail.com>
Date: Thu, 29 Oct 2020 10:33:33 -0400
Subject: [PATCH] Fix doc errors and typos across the board (#8139)

* Fix doc errors and typos across the board

* Fix a typo

* Fix the CI

* Fix more typos

* Fix CI

* More fixes

* Fix CI

* More fixes

* More fixes
---
 CONTRIBUTING.md                               |  4 +-
 docs/source/installation.md                   |  4 +-
 docs/source/migration.md                      |  4 +-
 docs/source/model_sharing.rst                 |  2 +-
 docs/source/task_summary.rst                  |  4 +-
 examples/adversarial/utils_hans.py            |  5 +-
 .../pabee/modeling_pabee_bert.py              |  2 +-
 examples/deebert/src/modeling_highway_bert.py |  4 +-
 examples/distillation/distiller.py            |  6 +--
 examples/distillation/lm_seqs_dataset.py      |  6 +--
 examples/distillation/scripts/extract.py      |  4 +-
 examples/lxmert/modeling_frcnn.py             | 12 ++---
 examples/lxmert/processing_image.py           |  2 +-
 examples/lxmert/utils.py                      |  4 +-
 .../movement-pruning/counts_parameters.py     |  2 +-
 .../emmental/modeling_bert_masked.py          |  4 +-
 examples/movement-pruning/masked_run_glue.py  |  6 +--
 examples/movement-pruning/masked_run_squad.py |  6 +--
 examples/rag/distributed_retriever.py         |  8 ++--
 examples/rag/eval_rag.py                      |  4 +-
 examples/rag/use_own_knowledge_dataset.py     |  2 +-
 .../seq2seq/bertabs/configuration_bertabs.py  |  4 +-
 examples/seq2seq/bertabs/modeling_bertabs.py  |  6 +--
 examples/seq2seq/convert_model_to_fp16.py     |  2 +-
 .../aubmindlab/bert-base-arabert/README.md    |  2 +-
 .../aubmindlab/bert-base-arabertv01/README.md |  2 +-
 .../cs224n-squad2.0-albert-large-v2/README.md |  2 +-
 .../jannesg/takalane_afr_roberta/README.md    |  2 +-
 model_cards/mrm8488/CodeBERTaPy/README.md     |  2 +-
 .../README.md                                 |  2 +-
 .../README.md                                 |  2 +-
 .../README.md                                 |  2 +-
 .../README.md                                 |  2 +-
 .../README.md                                 |  2 +-
 .../README.md                                 |  2 +-
 .../README.md                                 |  2 +-
 .../spanbert-base-finetuned-squadv1/README.md |  2 +-
 .../spanbert-base-finetuned-squadv2/README.md |  2 +-
 .../spanbert-base-finetuned-tacred/README.md  |  2 +-
 .../README.md                                 |  2 +-
 .../README.md                                 |  2 +-
 .../spanbert-large-finetuned-tacred/README.md |  2 +-
 .../README.md                                 |  2 +-
 .../t5-base-finetuned-wikiSQL/README.md       |  2 +-
 .../t5-small-finetuned-wikiSQL/README.md      |  2 +-
 .../xlm-multi-finetuned-xquadv1/README.md     |  2 +-
 src/transformers/benchmark/benchmark_utils.py | 17 +++----
 src/transformers/commands/convert.py          |  2 +-
 src/transformers/configuration_bart.py        |  2 +-
 src/transformers/configuration_bert.py        |  2 +-
 .../configuration_bert_generation.py          |  4 +-
 src/transformers/configuration_blenderbot.py  |  2 +-
 src/transformers/configuration_deberta.py     |  2 +-
 src/transformers/configuration_distilbert.py  |  2 +-
 src/transformers/configuration_dpr.py         |  2 +-
 src/transformers/configuration_electra.py     |  2 +-
 src/transformers/configuration_flaubert.py    |  4 +-
 src/transformers/configuration_fsmt.py        |  2 +-
 src/transformers/configuration_funnel.py      |  2 +-
 src/transformers/configuration_layoutlm.py    |  2 +-
 src/transformers/configuration_lxmert.py      | 11 ++---
 src/transformers/configuration_marian.py      |  2 +-
 src/transformers/configuration_mbart.py       |  2 +-
 src/transformers/configuration_pegasus.py     |  2 +-
 src/transformers/configuration_prophetnet.py  |  2 +-
 src/transformers/configuration_rag.py         |  4 +-
 src/transformers/configuration_reformer.py    |  2 +-
 src/transformers/configuration_retribert.py   |  4 +-
 src/transformers/configuration_squeezebert.py |  2 +-
 src/transformers/configuration_transfo_xl.py  |  2 +-
 src/transformers/convert_graph_to_onnx.py     |  2 +-
 ...r_original_pytorch_lightning_to_pytorch.py |  6 +--
 .../convert_marian_tatoeba_to_pytorch.py      |  2 +-
 ...ert_slow_tokenizers_checkpoints_to_fast.py |  2 +-
 ...xlnet_original_tf_checkpoint_to_pytorch.py |  2 +-
 src/transformers/data/data_collator.py        |  2 +-
 .../data/datasets/language_modeling.py        |  2 +-
 src/transformers/data/processors/squad.py     |  4 +-
 src/transformers/data/processors/utils.py     |  3 --
 src/transformers/file_utils.py                | 36 +++++++--------
 src/transformers/generation_tf_utils.py       | 26 +++++------
 src/transformers/generation_utils.py          | 18 ++++----
 src/transformers/modelcard.py                 |  2 +-
 src/transformers/modeling_auto.py             |  6 +--
 src/transformers/modeling_bart.py             |  2 +-
 src/transformers/modeling_deberta.py          | 14 +++---
 src/transformers/modeling_distilbert.py       |  2 +-
 src/transformers/modeling_dpr.py              |  2 +-
 src/transformers/modeling_encoder_decoder.py  |  2 +-
 src/transformers/modeling_flaubert.py         |  2 +-
 src/transformers/modeling_flax_auto.py        | 12 ++---
 src/transformers/modeling_flax_roberta.py     |  2 +-
 src/transformers/modeling_funnel.py           |  4 +-
 src/transformers/modeling_gpt2.py             |  6 +--
 src/transformers/modeling_longformer.py       | 10 ++--
 src/transformers/modeling_lxmert.py           | 14 ++----
 src/transformers/modeling_mobilebert.py       |  2 +-
 src/transformers/modeling_rag.py              | 20 ++++----
 src/transformers/modeling_reformer.py         |  4 +-
 src/transformers/modeling_retribert.py        |  6 +--
 src/transformers/modeling_t5.py               |  8 ++--
 src/transformers/modeling_tf_albert.py        |  2 +-
 src/transformers/modeling_tf_auto.py          | 12 ++---
 src/transformers/modeling_tf_bart.py          |  2 +-
 src/transformers/modeling_tf_bert.py          |  2 +-
 src/transformers/modeling_tf_distilbert.py    |  6 +--
 src/transformers/modeling_tf_flaubert.py      |  6 +--
 src/transformers/modeling_tf_funnel.py        |  4 +-
 src/transformers/modeling_tf_gpt2.py          |  2 +-
 src/transformers/modeling_tf_longformer.py    | 14 +++---
 src/transformers/modeling_tf_lxmert.py        |  8 ++--
 src/transformers/modeling_tf_mobilebert.py    |  2 +-
 src/transformers/modeling_tf_openai.py        |  2 +-
 src/transformers/modeling_tf_pytorch_utils.py |  4 +-
 src/transformers/modeling_tf_roberta.py       |  6 +--
 src/transformers/modeling_tf_t5.py            | 10 ++--
 src/transformers/modeling_tf_transfo_xl.py    |  2 +-
 src/transformers/modeling_tf_utils.py         |  8 ++--
 src/transformers/modeling_tf_xlm.py           |  4 +-
 src/transformers/modeling_tf_xlnet.py         |  4 +-
 src/transformers/modeling_utils.py            |  2 +-
 src/transformers/modeling_xlm.py              |  2 +-
 src/transformers/modeling_xlnet.py            |  2 +-
 src/transformers/optimization_tf.py           |  2 +-
 src/transformers/retrieval_rag.py             |  2 -
 src/transformers/tokenization_bert.py         |  2 +-
 src/transformers/tokenization_bertweet.py     |  2 +-
 src/transformers/tokenization_camembert.py    |  2 +-
 .../tokenization_camembert_fast.py            |  2 +-
 src/transformers/tokenization_deberta.py      |  2 +-
 src/transformers/tokenization_fsmt.py         |  2 +-
 src/transformers/tokenization_herbert.py      |  2 +-
 src/transformers/tokenization_herbert_fast.py |  6 +--
 src/transformers/tokenization_mbart.py        |  2 +-
 src/transformers/tokenization_mbart_fast.py   |  2 +-
 src/transformers/tokenization_phobert.py      |  2 +-
 src/transformers/tokenization_prophetnet.py   |  2 +-
 src/transformers/tokenization_transfo_xl.py   |  2 +-
 src/transformers/tokenization_utils.py        | 10 ++--
 src/transformers/tokenization_utils_base.py   | 46 ++++++++-----------
 src/transformers/tokenization_utils_fast.py   |  6 +--
 src/transformers/tokenization_xlm.py          |  2 +-
 .../tokenization_xlm_prophetnet.py            |  2 +-
 src/transformers/tokenization_xlm_roberta.py  |  2 +-
 .../tokenization_xlm_roberta_fast.py          |  2 +-
 src/transformers/tokenization_xlnet.py        |  2 +-
 src/transformers/tokenization_xlnet_fast.py   |  2 +-
 src/transformers/trainer.py                   |  2 -
 src/transformers/trainer_pt_utils.py          |  2 +-
 src/transformers/training_args.py             |  2 +-
 src/transformers/utils/notebook.py            |  4 +-
 .../adding_a_new_example_script/run_xxx.py    |  2 +-
 .../adding_a_new_example_script/utils_xxx.py  |  4 +-
 .../adding_a_new_model/configuration_xxx.py   |  2 +-
 .../adding_a_new_model/modeling_tf_xxx.py     |  2 +-
 .../adding_a_new_model/tokenization_xxx.py    |  2 +-
 tests/test_logging.py                         |  2 +-
 tests/test_modeling_common.py                 |  4 +-
 tests/test_modeling_tf_lxmert.py              |  2 +-
 utils/check_copies.py                         |  6 +--
 160 files changed, 342 insertions(+), 364 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3ce5a6e42f..6ea1a34a0e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -96,7 +96,7 @@ folder.
 
 ## Start contributing! (Pull Requests)
 
-Before writing code, we strongly advise you to search through the exising PRs or
+Before writing code, we strongly advise you to search through the existing PRs or
 issues to make sure that nobody is already working on the same thing. If you are
 unsure, it is always a good idea to open an issue to get some feedback.
 
@@ -235,7 +235,7 @@ Follow these steps to start contributing:
 ### Checklist
 
 1. The title of your pull request should be a summary of its contribution;
-2. If your pull request adresses an issue, please mention the issue number in
+2. If your pull request addresses an issue, please mention the issue number in
    the pull request description to make sure they are linked (and people
    consulting the issue know you are working on it);
 3. To indicate a work in progress please prefix the title with `[WIP]`. These
diff --git a/docs/source/installation.md b/docs/source/installation.md
index 6c5b81ca64..8e5a37af4b 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -80,9 +80,9 @@ cache home followed by ``/transformers/`` (even if you don't have PyTorch instal
 So if you don't have any specific environment variable set, the cache directory will be at
 ``~/.cache/torch/transformers/``.
 
-**Note:** If you have set a shell enviromnent variable for one of the predecessors of this library
+**Note:** If you have set a shell environment variable for one of the predecessors of this library
 (``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), those will be used if there is no shell
-enviromnent variable for ``TRANSFORMERS_CACHE``.
+environment variable for ``TRANSFORMERS_CACHE``.
 
 ### Note on model downloads (Continuous Integration or large-scale deployments)
 
diff --git a/docs/source/migration.md b/docs/source/migration.md
index 0cf53e1fea..f3b1b55b54 100644
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -20,7 +20,7 @@ Here is a quick summary of what you should take care of when migrating from `pyt
 
 The main breaking change when migrating from `pytorch-pretrained-bert` to 🤗 Transformers is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
 
-The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
+The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
 
 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
 
@@ -109,7 +109,7 @@ for batch in train_data:
     loss.backward()
     optimizer.step()
 
-### In 🤗 Transformers, optimizer and schedules are splitted and instantiated like this:
+### In 🤗 Transformers, optimizer and schedules are split and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
 scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
 ### and used like this:
diff --git a/docs/source/model_sharing.rst b/docs/source/model_sharing.rst
index 991b0132cd..065693e150 100644
--- a/docs/source/model_sharing.rst
+++ b/docs/source/model_sharing.rst
@@ -119,7 +119,7 @@ Other files can safely be deleted.
 Upload your model with the CLI
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Now go in a terminal and run the following command. It should be in the virtual enviromnent where you installed 🤗
+Now go in a terminal and run the following command. It should be in the virtual environment where you installed 🤗
 Transformers, since that command :obj:`transformers-cli` comes from the library.
 
 .. code-block::
diff --git a/docs/source/task_summary.rst b/docs/source/task_summary.rst
index dd451f244f..d92c849845 100644
--- a/docs/source/task_summary.rst
+++ b/docs/source/task_summary.rst
@@ -510,8 +510,8 @@ As a default all models apply *Top-K* sampling when used in pipelines, as config
 
 
 Here, the model generates a random text with a total maximal length of *50* tokens from context *"As far as I am
-concerned, I will"*. The default arguments of ``PreTrainedModel.generate()`` can be directly overriden in the pipeline,
-as is shown above for the argument ``max_length``.
+concerned, I will"*. The default arguments of ``PreTrainedModel.generate()`` can be directly overridden in the
+pipeline, as is shown above for the argument ``max_length``.
 
 Here is an example of text generation using ``XLNet`` and its tokenzier.
 
diff --git a/examples/adversarial/utils_hans.py b/examples/adversarial/utils_hans.py
index ffe6145e29..bf0623ffb1 100644
--- a/examples/adversarial/utils_hans.py
+++ b/examples/adversarial/utils_hans.py
@@ -291,10 +291,9 @@ def hans_convert_examples_to_features(
 
     Args:
         examples: List of ``InputExamples`` containing the examples.
-        tokenizer: Instance of a tokenizer that will tokenize the examples.
-        max_length: Maximum example length.
         label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method.
-        output_mode: String indicating the output mode. Either ``regression`` or ``classification``.
+        max_length: Maximum example length.
+        tokenizer: Instance of a tokenizer that will tokenize the examples.
 
     Returns:
         A list of task-specific ``InputFeatures`` which can be fed to the model.
diff --git a/examples/bert-loses-patience/pabee/modeling_pabee_bert.py b/examples/bert-loses-patience/pabee/modeling_pabee_bert.py
index 78de015b1d..6852ab0bd9 100644
--- a/examples/bert-loses-patience/pabee/modeling_pabee_bert.py
+++ b/examples/bert-loses-patience/pabee/modeling_pabee_bert.py
@@ -155,7 +155,7 @@ class BertModelWithPabee(BertModel):
         extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
 
         # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if self.config.is_decoder and encoder_hidden_states is not None:
             encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
diff --git a/examples/deebert/src/modeling_highway_bert.py b/examples/deebert/src/modeling_highway_bert.py
index 84fc10bd0e..5635fbee5f 100644
--- a/examples/deebert/src/modeling_highway_bert.py
+++ b/examples/deebert/src/modeling_highway_bert.py
@@ -198,7 +198,7 @@ class DeeBertModel(BertPreTrainedModel):
         extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
 
         # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if encoder_attention_mask.dim() == 3:
             encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
         if encoder_attention_mask.dim() == 2:
@@ -260,7 +260,7 @@ class BertHighway(nn.Module):
 
         # BertModel
         bmodel_output = (pooler_input, pooler_output) + encoder_outputs[1:]
-        # "return" bodel_output
+        # "return" bmodel_output
 
         # Dropout and classification
         pooled_output = bmodel_output[1]
diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py
index 893d9916a9..d724ac6e29 100644
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -265,7 +265,7 @@ class Distiller:
         -------
             token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
             attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-            clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -100 where there is nothing to predict.
+            clm_labels: `torch.tensor(bs, seq_length)` - The causal language modeling labels. There is a -100 where there is nothing to predict.
         """
         token_ids, lengths = batch
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
@@ -401,9 +401,9 @@ class Distiller:
         # https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
         # https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
         if self.params.restrict_ce_to_mask:
-            mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits)  # (bs, seq_lenth, voc_size)
+            mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits)  # (bs, seq_length, voc_size)
         else:
-            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)  # (bs, seq_lenth, voc_size)
+            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)  # (bs, seq_length, voc_size)
         s_logits_slct = torch.masked_select(s_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
         s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
         t_logits_slct = torch.masked_select(t_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
diff --git a/examples/distillation/lm_seqs_dataset.py b/examples/distillation/lm_seqs_dataset.py
index 0c793942c6..8e0a5814ab 100644
--- a/examples/distillation/lm_seqs_dataset.py
+++ b/examples/distillation/lm_seqs_dataset.py
@@ -61,7 +61,7 @@ class LmSeqsDataset(Dataset):
 
     def remove_long_sequences(self):
         """
-        Sequences that are too long are splitted by chunk of max_model_input_size.
+        Sequences that are too long are split by chunk of max_model_input_size.
         """
         max_len = self.params.max_model_input_size
         indices = self.lengths > max_len
@@ -138,8 +138,8 @@ class LmSeqsDataset(Dataset):
         # logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)')
 
         # unk_idx = self.params.special_tok_ids['unk_token']
-        # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
-        # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
+        # nb_unknown = sum([(t==unk_idx).sum() for t in self.token_ids])
+        # logger.info(f'{nb_unknown} unknown tokens (covering {100*nb_unknown/data_len:.2f}% of the data)')
 
     def batch_sequences(self, batch):
         """
diff --git a/examples/distillation/scripts/extract.py b/examples/distillation/scripts/extract.py
index b4bea90d53..d7a99b1d89 100644
--- a/examples/distillation/scripts/extract.py
+++ b/examples/distillation/scripts/extract.py
@@ -96,7 +96,7 @@ if __name__ == "__main__":
         compressed_sd["lm_head.weight"] = state_dict["lm_head.weight"]
 
     print(f"N layers selected for distillation: {std_idx}")
-    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
+    print(f"Number of params transferred for distillation: {len(compressed_sd.keys())}")
 
-    print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
+    print(f"Save transferred checkpoint to {args.dump_checkpoint}.")
     torch.save(compressed_sd, args.dump_checkpoint)
diff --git a/examples/lxmert/modeling_frcnn.py b/examples/lxmert/modeling_frcnn.py
index 31fc2cb261..40b0e4bbfb 100644
--- a/examples/lxmert/modeling_frcnn.py
+++ b/examples/lxmert/modeling_frcnn.py
@@ -266,14 +266,14 @@ def find_top_rpn_proposals(
 ):
     """Args:
         proposals (list[Tensor]): (L, N, Hi*Wi*A, 4).
-        pred_objectness_logits: tensors of lenngth L.
+        pred_objectness_logits: tensors of length L.
         nms_thresh (float): IoU threshold to use for NMS
         pre_nms_topk (int): before nms
         post_nms_topk (int): after nms
         min_box_side_len (float): minimum proposal box side
         training (bool): True if proposals are to be used in training,
     Returns:
-        resuls (List[Dict]): stores post_nms_topk object proposals for image i.
+        results (List[Dict]): stores post_nms_topk object proposals for image i.
     """
     num_images = len(images)
     device = proposals[0].device
@@ -648,7 +648,7 @@ class RPNOutputs(object):
             images (ImageList): :class:`ImageList` instance representing N input images
             pred_objectness_logits (list[Tensor]): A list of L elements. Element i is a tensor of shape (N, A, Hi, W)
             pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape (N, A*4, Hi, Wi)
-            anchors (list[torch.Tensor]): nested list ofboxes. anchors[i][j] at (n, l) stores anchor array for feature map l
+            anchors (list[torch.Tensor]): nested list of boxes. anchors[i][j] at (n, l) stores anchor array for feature map l
             boundary_threshold (int): if >= 0, then anchors that extend beyond the image boundary by more than boundary_thresh are not used in training.
             gt_boxes (list[Boxes], optional): A list of N elements.
             smooth_l1_beta (float): The transition point between L1 and L2 lossn. When set to 0, the loss becomes L1. When +inf, it is ignored
@@ -1186,7 +1186,7 @@ class ROIOutputs(object):
         attr_probs_all, attrs_all = self._predict_attrs(attr_logits, preds_per_image)
         features = features.split(preds_per_image, dim=0)
 
-        # fun for each image too, also I can expirement and do multiple images
+        # fun for each image too, also I can experiment and do multiple images
         final_results = []
         zipped = zip(boxes_all, obj_scores_all, attr_probs_all, attrs_all, sizes)
         for i, (boxes, obj_scores, attr_probs, attrs, size) in enumerate(zipped):
@@ -1412,7 +1412,7 @@ class AnchorGenerator(nn.Module):
 
     def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)):
         """
-        anchors are continious geometric rectangles
+        anchors are continuous geometric rectangles
         centered on one feature map point sample.
         We can later build the set of anchors
         for the entire feature map by tiling these tensors
@@ -1865,7 +1865,7 @@ class GeneralizedRCNN(nn.Module):
         scales_yx=None,
         **kwargs,
     ):
-        # run images through bacbone
+        # run images through backbone
         original_sizes = image_shapes * scales_yx
         features = self.backbone(images)
 
diff --git a/examples/lxmert/processing_image.py b/examples/lxmert/processing_image.py
index 7b56554b4e..ff449985b0 100644
--- a/examples/lxmert/processing_image.py
+++ b/examples/lxmert/processing_image.py
@@ -116,7 +116,7 @@ class Preprocess:
             images = self.aug(images)
             # transpose images and convert to torch tensors
             # images = [torch.as_tensor(i.astype("float32")).permute(2, 0, 1).to(self.device) for i in images]
-            # now normalize before pad to aoid useless arithmatic
+            # now normalize before pad to avoid useless arithmetic
             images = [self.normalizer(x) for x in images]
             # now pad them to do the following operations
             images, sizes = self.pad(images)
diff --git a/examples/lxmert/utils.py b/examples/lxmert/utils.py
index f69bcecb17..1faf9feffa 100644
--- a/examples/lxmert/utils.py
+++ b/examples/lxmert/utils.py
@@ -236,7 +236,7 @@ def compare(in_tensor):
     ), f"{sum([1 for x in np.isclose(n1, n2, rtol=0.01, atol=0.1).flatten() if x == False])/len(n1.flatten())*100:.4f} % element-wise mismatch"
     raise Exception("tensors are all good")
 
-    # Hugging face functiions below
+    # Hugging face functions below
 
 
 def is_remote_url(url_or_filename):
@@ -520,7 +520,7 @@ def get_image_from_url(url):
     return img
 
 
-# to load legace frcnn checkpoint from detectron
+# to load legacy frcnn checkpoint from detectron
 def load_frcnn_pkl_from_url(url):
     fn = url.split("/")[-1]
     if fn not in os.listdir(os.getcwd()):
diff --git a/examples/movement-pruning/counts_parameters.py b/examples/movement-pruning/counts_parameters.py
index 8553f6f812..0dddfaaa27 100644
--- a/examples/movement-pruning/counts_parameters.py
+++ b/examples/movement-pruning/counts_parameters.py
@@ -33,7 +33,7 @@ def main(args):
     remaining_count = 0  # Number of remaining (not pruned) params in the encoder
     encoder_count = 0  # Number of params in the encoder
 
-    print("name".ljust(60, " "), "Remaining Weights %", "Remaning Weight")
+    print("name".ljust(60, " "), "Remaining Weights %", "Remaining Weight")
     for name, param in st.items():
         if "encoder" not in name:
             continue
diff --git a/examples/movement-pruning/emmental/modeling_bert_masked.py b/examples/movement-pruning/emmental/modeling_bert_masked.py
index bcc8690c39..c4f5a422a2 100644
--- a/examples/movement-pruning/emmental/modeling_bert_masked.py
+++ b/examples/movement-pruning/emmental/modeling_bert_masked.py
@@ -591,7 +591,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
 
         # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if self.config.is_decoder and encoder_hidden_states is not None:
             encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
@@ -631,7 +631,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
                 )  # We can specify head_mask for each layer
             head_mask = head_mask.to(
                 dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
+            )  # switch to float if need + fp16 compatibility
         else:
             head_mask = [None] * self.config.num_hidden_layers
 
diff --git a/examples/movement-pruning/masked_run_glue.py b/examples/movement-pruning/masked_run_glue.py
index 09dfc8cf6e..b07fe03d29 100644
--- a/examples/movement-pruning/masked_run_glue.py
+++ b/examples/movement-pruning/masked_run_glue.py
@@ -225,7 +225,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
         desc="Epoch",
         disable=args.local_rank not in [-1, 0],
     )
-    set_seed(args)  # Added here for reproductibility
+    set_seed(args)  # Added here for reproducibility
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
@@ -705,7 +705,7 @@ def main():
         "--final_lambda",
         default=0.0,
         type=float,
-        help="Regularization intensity (used in conjunction with `regulariation`.",
+        help="Regularization intensity (used in conjunction with `regularization`.",
     )
 
     parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
@@ -816,7 +816,7 @@ def main():
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
         torch.distributed.init_process_group(backend="nccl")
diff --git a/examples/movement-pruning/masked_run_squad.py b/examples/movement-pruning/masked_run_squad.py
index 1311dd620d..56d13b6f97 100644
--- a/examples/movement-pruning/masked_run_squad.py
+++ b/examples/movement-pruning/masked_run_squad.py
@@ -231,7 +231,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
     train_iterator = trange(
         epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
     )
-    # Added here for reproductibility
+    # Added here for reproducibility
     set_seed(args)
 
     for _ in train_iterator:
@@ -824,7 +824,7 @@ def main():
         "--final_lambda",
         default=0.0,
         type=float,
-        help="Regularization intensity (used in conjunction with `regulariation`.",
+        help="Regularization intensity (used in conjunction with `regularization`.",
     )
 
     parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
@@ -977,7 +977,7 @@ def main():
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
         torch.distributed.init_process_group(backend="nccl")
diff --git a/examples/rag/distributed_retriever.py b/examples/rag/distributed_retriever.py
index 4fdb1838b8..738ebda99e 100644
--- a/examples/rag/distributed_retriever.py
+++ b/examples/rag/distributed_retriever.py
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
 class RagPyTorchDistributedRetriever(RagRetriever):
     """
     A distributed retriever built on top of the ``torch.distributed`` communication package. During training all workers
-    initalize their own instance of the retriever, however, only the main worker loads the index into memory. The index is stored
+    initialize their own instance of the retriever, however, only the main worker loads the index into memory. The index is stored
     in cpu memory. The index will also work well in a non-distributed setup.
 
     Args:
@@ -45,7 +45,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):
 
     def init_retrieval(self, distributed_port: int):
         """
-        Retriever initalization function, needs to be called from the training process. The function sets some common parameters
+        Retriever initialization function, needs to be called from the training process. The function sets some common parameters
         and environment variables. On top of that, (only) the main process in the process group loads the index into memory.
 
         Args:
@@ -56,7 +56,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):
 
         logger.info("initializing retrieval")
 
-        # initializing a separate process group for retrievel as the default
+        # initializing a separate process group for retrieval as the default
         # nccl backend doesn't support gather/scatter operations while gloo
         # is too slow to replace nccl for the core gpu communication
         if dist.is_initialized():
@@ -101,7 +101,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):
             n_docs (:obj:`int`):
                 The number of docs retrieved per query.
 
-        Ouput:
+        Output:
             retrieved_doc_embeds (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, dim)`
                 The retrieval embeddings of the retrieved docs per query.
             doc_ids (:obj:`np.ndarray` of shape :obj:`batch_size, n_docs`)
diff --git a/examples/rag/eval_rag.py b/examples/rag/eval_rag.py
index 73913c1acd..fd0c9711a6 100644
--- a/examples/rag/eval_rag.py
+++ b/examples/rag/eval_rag.py
@@ -176,7 +176,7 @@ def get_args():
         choices=["e2e", "retrieval"],
         default="e2e",
         type=str,
-        help="Evaluation mode, e2e calculates exact match and F1 of the downstream task, retrieval calulates precision@k.",
+        help="Evaluation mode, e2e calculates exact match and F1 of the downstream task, retrieval calculates precision@k.",
     )
     parser.add_argument("--k", default=1, type=int, help="k for the precision@k calculation")
     parser.add_argument(
@@ -206,7 +206,7 @@ def get_args():
         "--predictions_path",
         type=str,
         default="predictions.txt",
-        help="Name of the predictions file, to be stored in the checkpoints directry",
+        help="Name of the predictions file, to be stored in the checkpoints directory",
     )
     parser.add_argument(
         "--eval_all_checkpoints",
diff --git a/examples/rag/use_own_knowledge_dataset.py b/examples/rag/use_own_knowledge_dataset.py
index bfdec26275..fd465e6900 100644
--- a/examples/rag/use_own_knowledge_dataset.py
+++ b/examples/rag/use_own_knowledge_dataset.py
@@ -26,7 +26,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 def split_text(text: str, n=100, character=" ") -> List[str]:
-    """Split the text every ``n``-th occurence of ``character``"""
+    """Split the text every ``n``-th occurrence of ``character``"""
     text = text.split(character)
     return [character.join(text[i : i + n]).strip() for i in range(0, len(text), n)]
 
diff --git a/examples/seq2seq/bertabs/configuration_bertabs.py b/examples/seq2seq/bertabs/configuration_bertabs.py
index 29dd46362f..68dc0df814 100644
--- a/examples/seq2seq/bertabs/configuration_bertabs.py
+++ b/examples/seq2seq/bertabs/configuration_bertabs.py
@@ -44,7 +44,7 @@ class BertAbsConfig(PretrainedConfig):
         enc_ff_size: int
             The size of the encoder's feed-forward layers.
         enc_dropout: int
-            The dropout probabilitiy for all fully connected layers in the
+            The dropout probability for all fully connected layers in the
             embeddings, layers, pooler and also the attention probabilities in
             the encoder.
         dec_layer: int
@@ -56,7 +56,7 @@ class BertAbsConfig(PretrainedConfig):
         dec_ff_size: int
             The size of the decoder's feed-forward layers.
         dec_dropout: int
-            The dropout probabilitiy for all fully connected layers in the
+            The dropout probability for all fully connected layers in the
             embeddings, layers, pooler and also the attention probabilities in
             the decoder.
     """
diff --git a/examples/seq2seq/bertabs/modeling_bertabs.py b/examples/seq2seq/bertabs/modeling_bertabs.py
index 103c0b4d5b..ce0e25e2b1 100644
--- a/examples/seq2seq/bertabs/modeling_bertabs.py
+++ b/examples/seq2seq/bertabs/modeling_bertabs.py
@@ -152,7 +152,7 @@ class TransformerDecoder(nn.Module):
        dropout (float): dropout parameters
        embeddings (:obj:`onmt.modules.Embeddings`):
           embeddings to use, should have positional encodings
-       attn_type (str): if using a seperate copy attention
+       attn_type (str): if using a separate copy attention
     """
 
     def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings, vocab_size):
@@ -817,11 +817,7 @@ class Translator(object):
 
         Args:
            batch (:obj:`Batch`): a batch from a dataset object
-           data (:obj:`Dataset`): the dataset object
            fast (bool): enables fast beam search (may not support all features)
-
-        Todo:
-           Shouldn't need the original dataset.
         """
         with torch.no_grad():
             return self._fast_translate_batch(batch, self.max_length, min_length=self.min_length)
diff --git a/examples/seq2seq/convert_model_to_fp16.py b/examples/seq2seq/convert_model_to_fp16.py
index 26b1ff8fd8..e853d0393c 100755
--- a/examples/seq2seq/convert_model_to_fp16.py
+++ b/examples/seq2seq/convert_model_to_fp16.py
@@ -12,7 +12,7 @@ def convert(src_path: str, map_location: str = "cpu", save_path: Union[str, None
     state_dict = torch.load(src_path, map_location=map_location)
     for k, v in tqdm(state_dict.items()):
         if not isinstance(v, torch.Tensor):
-            raise TypeError("FP16 conversion only works on paths that are saved state dics, like pytorch_model.bin")
+            raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin")
         state_dict[k] = v.half()
     if save_path is None:  # overwrite src_path
         save_path = src_path
diff --git a/model_cards/aubmindlab/bert-base-arabert/README.md b/model_cards/aubmindlab/bert-base-arabert/README.md
index 4b6ced3442..772676b6dc 100644
--- a/model_cards/aubmindlab/bert-base-arabert/README.md
+++ b/model_cards/aubmindlab/bert-base-arabert/README.md
@@ -7,7 +7,7 @@ language: ar
 
 **AraBERT** is an Arabic pretrained lanaguage model based on [Google's BERT architechture](https://github.com/google-research/bert). AraBERT uses the same BERT-Base config. More details are available in the [AraBERT PAPER](https://arxiv.org/abs/2003.00104v2) and in the [AraBERT Meetup](https://github.com/WissamAntoun/pydata_khobar_meetup)
 
-There are two version off the model AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were splitted using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).
+There are two versions of the model, AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were split using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).
 
 The model was trained on ~70M sentences or ~23GB of Arabic text with ~3B words. The training corpora are a collection of publically available large scale raw arabic text ([Arabic Wikidumps](https://archive.org/details/arwiki-20190201), [The 1.5B words Arabic Corpus](https://www.semanticscholar.org/paper/1.5-billion-words-Arabic-Corpus-El-Khair/f3eeef4afb81223df96575adadf808fe7fe440b4), [The OSIAN Corpus](https://www.aclweb.org/anthology/W19-4619), Assafir news articles, and 4 other manually crawled news websites (Al-Akhbar, Annahar, AL-Ahram, AL-Wafd) from [the Wayback Machine](http://web.archive.org/))
 
diff --git a/model_cards/aubmindlab/bert-base-arabertv01/README.md b/model_cards/aubmindlab/bert-base-arabertv01/README.md
index 4b6ced3442..772676b6dc 100644
--- a/model_cards/aubmindlab/bert-base-arabertv01/README.md
+++ b/model_cards/aubmindlab/bert-base-arabertv01/README.md
@@ -7,7 +7,7 @@ language: ar
 
 **AraBERT** is an Arabic pretrained lanaguage model based on [Google's BERT architechture](https://github.com/google-research/bert). AraBERT uses the same BERT-Base config. More details are available in the [AraBERT PAPER](https://arxiv.org/abs/2003.00104v2) and in the [AraBERT Meetup](https://github.com/WissamAntoun/pydata_khobar_meetup)
 
-There are two version off the model AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were splitted using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).
+There are two versions of the model, AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were split using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).
 
 The model was trained on ~70M sentences or ~23GB of Arabic text with ~3B words. The training corpora are a collection of publically available large scale raw arabic text ([Arabic Wikidumps](https://archive.org/details/arwiki-20190201), [The 1.5B words Arabic Corpus](https://www.semanticscholar.org/paper/1.5-billion-words-Arabic-Corpus-El-Khair/f3eeef4afb81223df96575adadf808fe7fe440b4), [The OSIAN Corpus](https://www.aclweb.org/anthology/W19-4619), Assafir news articles, and 4 other manually crawled news websites (Al-Akhbar, Annahar, AL-Ahram, AL-Wafd) from [the Wayback Machine](http://web.archive.org/))
 
diff --git a/model_cards/elgeish/cs224n-squad2.0-albert-large-v2/README.md b/model_cards/elgeish/cs224n-squad2.0-albert-large-v2/README.md
index 6ecad5d326..5f365d2d7b 100644
--- a/model_cards/elgeish/cs224n-squad2.0-albert-large-v2/README.md
+++ b/model_cards/elgeish/cs224n-squad2.0-albert-large-v2/README.md
@@ -4,7 +4,7 @@ tags:
 ---
 
 ## CS224n SQuAD2.0 Project Dataset
-The goal of this model is to save CS224n students GPU time when establising
+The goal of this model is to save CS224n students GPU time when establishing
 baselines to beat for the [Default Final Project](http://web.stanford.edu/class/cs224n/project/default-final-project-handout.pdf).
 The training set used to fine-tune this model is the same as
 the [official one](https://rajpurkar.github.io/SQuAD-explorer/); however,
diff --git a/model_cards/jannesg/takalane_afr_roberta/README.md b/model_cards/jannesg/takalane_afr_roberta/README.md
index d43471c4f7..5b0573fef4 100644
--- a/model_cards/jannesg/takalane_afr_roberta/README.md
+++ b/model_cards/jannesg/takalane_afr_roberta/README.md
@@ -34,7 +34,7 @@ model = AutoModelWithLMHead.from_pretrained("jannesg/takalane_afr_roberta")
 
 #### Limitations and bias
 
-Updates will be added continously to improve performance. 
+Updates will be added continuously to improve performance. 
 
 ## Training data
 
diff --git a/model_cards/mrm8488/CodeBERTaPy/README.md b/model_cards/mrm8488/CodeBERTaPy/README.md
index 95f471a54c..e29377bdae 100644
--- a/model_cards/mrm8488/CodeBERTaPy/README.md
+++ b/model_cards/mrm8488/CodeBERTaPy/README.md
@@ -94,7 +94,7 @@ fill_mask(PYTHON_CODE3)
 
 > Great! 🎉
 
-## This work is heavely inspired on [CodeBERTa](https://github.com/huggingface/transformers/blob/master/model_cards/huggingface/CodeBERTa-small-v1/README.md) by huggingface team
+## This work is heavily inspired by [CodeBERTa](https://github.com/huggingface/transformers/blob/master/model_cards/huggingface/CodeBERTa-small-v1/README.md) from the Hugging Face team
 
 <br>
 
diff --git a/model_cards/mrm8488/TinyBERT-spanish-uncased-finetuned-ner/README.md b/model_cards/mrm8488/TinyBERT-spanish-uncased-finetuned-ner/README.md
index aefb1fe7d9..1a595fe1ba 100644
--- a/model_cards/mrm8488/TinyBERT-spanish-uncased-finetuned-ner/README.md
+++ b/model_cards/mrm8488/TinyBERT-spanish-uncased-finetuned-ner/README.md
@@ -11,7 +11,7 @@ This model is a fine-tuned on [NER-C](https://www.kaggle.com/nltkdata/conll-corp
 
 - [Dataset:  CONLL Corpora ES](https://www.kaggle.com/nltkdata/conll-corpora) 
 
-I preprocessed the dataset and splitted it as train / dev (80/20)
+I preprocessed the dataset and split it as train / dev (80/20)
 
 | Dataset                | # Examples |
 | ---------------------- | ----- |
diff --git a/model_cards/mrm8488/bert-multi-cased-finetuned-xquadv1/README.md b/model_cards/mrm8488/bert-multi-cased-finetuned-xquadv1/README.md
index 8cafde0da2..7849ec85f1 100644
--- a/model_cards/mrm8488/bert-multi-cased-finetuned-xquadv1/README.md
+++ b/model_cards/mrm8488/bert-multi-cased-finetuned-xquadv1/README.md
@@ -65,7 +65,7 @@ Citation:
 
 </details>
 
-As **XQuAD** is just an evaluation dataset, I used `Data augmentation techniques` (scraping, neural machine translation, etc) to obtain more samples and splited the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
+As **XQuAD** is just an evaluation dataset, I used `Data augmentation techniques` (scraping, neural machine translation, etc) to obtain more samples and split the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
 
 | Dataset     | # samples |
 | ----------- | --------- |
diff --git a/model_cards/mrm8488/bert-multi-uncased-finetuned-xquadv1/README.md b/model_cards/mrm8488/bert-multi-uncased-finetuned-xquadv1/README.md
index 39368ef365..f04c569885 100644
--- a/model_cards/mrm8488/bert-multi-uncased-finetuned-xquadv1/README.md
+++ b/model_cards/mrm8488/bert-multi-uncased-finetuned-xquadv1/README.md
@@ -65,7 +65,7 @@ Citation:
 
 </details>
 
-As **XQuAD** is just an evaluation dataset, I used `Data augmentation techniques` (scraping, neural machine translation, etc) to obtain more samples and splited the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
+As **XQuAD** is just an evaluation dataset, I used `Data augmentation techniques` (scraping, neural machine translation, etc) to obtain more samples and split the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
 
 | Dataset     | # samples |
 | ----------- | --------- |
diff --git a/model_cards/mrm8488/bert-spanish-cased-finetuned-ner/README.md b/model_cards/mrm8488/bert-spanish-cased-finetuned-ner/README.md
index 445a942f66..67465c9ea8 100644
--- a/model_cards/mrm8488/bert-spanish-cased-finetuned-ner/README.md
+++ b/model_cards/mrm8488/bert-spanish-cased-finetuned-ner/README.md
@@ -11,7 +11,7 @@ This model is a fine-tuned on [NER-C](https://www.kaggle.com/nltkdata/conll-corp
 
 - [Dataset:  CONLL Corpora ES](https://www.kaggle.com/nltkdata/conll-corpora) 
 
-I preprocessed the dataset and splitted it as train / dev (80/20)
+I preprocessed the dataset and split it as train / dev (80/20)
 
 | Dataset                | # Examples |
 | ---------------------- | ----- |
diff --git a/model_cards/mrm8488/bert-spanish-cased-finetuned-pos/README.md b/model_cards/mrm8488/bert-spanish-cased-finetuned-pos/README.md
index 5cc55b9899..e1827e4eff 100644
--- a/model_cards/mrm8488/bert-spanish-cased-finetuned-pos/README.md
+++ b/model_cards/mrm8488/bert-spanish-cased-finetuned-pos/README.md
@@ -11,7 +11,7 @@ This model is a fine-tuned on Spanish [CONLL CORPORA](https://www.kaggle.com/nlt
 
 - [Dataset:  CONLL Corpora ES](https://www.kaggle.com/nltkdata/conll-corpora) with data augmentation techniques
 
-I preprocessed the dataset and splitted it as train / dev (80/20)
+I preprocessed the dataset and split it as train / dev (80/20)
 
 | Dataset                | # Examples |
 | ---------------------- | ----- |
diff --git a/model_cards/mrm8488/mobilebert-uncased-finetuned-squadv1/README.md b/model_cards/mrm8488/mobilebert-uncased-finetuned-squadv1/README.md
index 55ca9b6c75..68a87d9f9a 100644
--- a/model_cards/mrm8488/mobilebert-uncased-finetuned-squadv1/README.md
+++ b/model_cards/mrm8488/mobilebert-uncased-finetuned-squadv1/README.md
@@ -44,7 +44,7 @@ python transformers/examples/question-answering/run_squad.py \
   --save_steps 1000
 ```
 
-It is importatnt to say that this models converges much faster than other ones. So, it is also cheap to fine-tune.
+It is important to say that this model converges much faster than other ones. So, it is also cheap to fine-tune.
 
 ## Test set Results 🧾
 
diff --git a/model_cards/mrm8488/mobilebert-uncased-finetuned-squadv2/README.md b/model_cards/mrm8488/mobilebert-uncased-finetuned-squadv2/README.md
index 4e925af9c5..3bd933b77d 100644
--- a/model_cards/mrm8488/mobilebert-uncased-finetuned-squadv2/README.md
+++ b/model_cards/mrm8488/mobilebert-uncased-finetuned-squadv2/README.md
@@ -44,7 +44,7 @@ python transformers/examples/question-answering/run_squad.py \
   --version_2_with_negative
 ```
 
-It is importatnt to say that this models converges much faster than other ones. So, it is also cheap to fine-tune.
+It is important to say that this model converges much faster than other ones. So, it is also cheap to fine-tune.
 
 ## Test set Results 🧾
 
diff --git a/model_cards/mrm8488/spanbert-base-finetuned-squadv1/README.md b/model_cards/mrm8488/spanbert-base-finetuned-squadv1/README.md
index f31d384aab..8ebb811418 100644
--- a/model_cards/mrm8488/spanbert-base-finetuned-squadv1/README.md
+++ b/model_cards/mrm8488/spanbert-base-finetuned-squadv1/README.md
@@ -48,7 +48,7 @@ python code/run_squad.py \
 | SpanBERT (large)        | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1)         | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2)     | 79.6    |  [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred)  |
 
 
-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
 
 ## Model in action
 
diff --git a/model_cards/mrm8488/spanbert-base-finetuned-squadv2/README.md b/model_cards/mrm8488/spanbert-base-finetuned-squadv2/README.md
index f4ff39517c..865c66c8b7 100644
--- a/model_cards/mrm8488/spanbert-base-finetuned-squadv2/README.md
+++ b/model_cards/mrm8488/spanbert-base-finetuned-squadv2/README.md
@@ -54,7 +54,7 @@ python code/run_squad.py \
 | SpanBERT (large)        | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1)          | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2)     | 79.6    |  [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred)  |
 
 
-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
 
 ## Model in action
 
diff --git a/model_cards/mrm8488/spanbert-base-finetuned-tacred/README.md b/model_cards/mrm8488/spanbert-base-finetuned-tacred/README.md
index 199fe0c955..3ee1158b26 100644
--- a/model_cards/mrm8488/spanbert-base-finetuned-tacred/README.md
+++ b/model_cards/mrm8488/spanbert-base-finetuned-tacred/README.md
@@ -45,7 +45,7 @@ python code/run_tacred.py \
 | SpanBERT (large)        | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1)        | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2)     | 79.6    |  [70.8](https://huggingface.co/mrm8488/spanbert-base-finetuned-tacred)   |
 
 
-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
 
 
 > Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488)
diff --git a/model_cards/mrm8488/spanbert-large-finetuned-squadv1/README.md b/model_cards/mrm8488/spanbert-large-finetuned-squadv1/README.md
index 0ae2473f2a..3bfc6ef42f 100644
--- a/model_cards/mrm8488/spanbert-large-finetuned-squadv1/README.md
+++ b/model_cards/mrm8488/spanbert-large-finetuned-squadv1/README.md
@@ -48,7 +48,7 @@ python code/run_squad.py \
 | SpanBERT (large)        | **94.6** (this)         | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2)     | 79.6    |  [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred)  |
 
 
-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
 
 ## Model in action
 
diff --git a/model_cards/mrm8488/spanbert-large-finetuned-squadv2/README.md b/model_cards/mrm8488/spanbert-large-finetuned-squadv2/README.md
index 1edfd62d0f..51b9d3ae79 100644
--- a/model_cards/mrm8488/spanbert-large-finetuned-squadv2/README.md
+++ b/model_cards/mrm8488/spanbert-large-finetuned-squadv2/README.md
@@ -54,7 +54,7 @@ python code/run_squad.py \
 | SpanBERT (large)        | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1)          | **88.7** (this)     | 79.6    |  [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred)  |
 
 
-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
 
 ## Model in action
 
diff --git a/model_cards/mrm8488/spanbert-large-finetuned-tacred/README.md b/model_cards/mrm8488/spanbert-large-finetuned-tacred/README.md
index 0a11f44f33..826c69be1c 100644
--- a/model_cards/mrm8488/spanbert-large-finetuned-tacred/README.md
+++ b/model_cards/mrm8488/spanbert-large-finetuned-tacred/README.md
@@ -45,7 +45,7 @@ python code/run_tacred.py \
 | SpanBERT (large)        | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1)        | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2)     | 79.6    |  **70.8** (this one)  |
 
 
-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
 
 
 > Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488)
diff --git a/model_cards/mrm8488/t5-base-finetuned-wikiSQL-sql-to-en/README.md b/model_cards/mrm8488/t5-base-finetuned-wikiSQL-sql-to-en/README.md
index 00003c13a3..73932f1f0b 100644
--- a/model_cards/mrm8488/t5-base-finetuned-wikiSQL-sql-to-en/README.md
+++ b/model_cards/mrm8488/t5-base-finetuned-wikiSQL-sql-to-en/README.md
@@ -50,7 +50,7 @@ tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL-sql
 model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL-sql-to-en")
 
 def get_explanation(query):
-  input_text = "translante Sql to English: %s </s>" % query
+  input_text = "translate Sql to English: %s </s>" % query
   features = tokenizer([input_text], return_tensors='pt')
 
   output = model.generate(input_ids=features['input_ids'], 
diff --git a/model_cards/mrm8488/t5-base-finetuned-wikiSQL/README.md b/model_cards/mrm8488/t5-base-finetuned-wikiSQL/README.md
index 59ea029683..3e2b46cf6c 100644
--- a/model_cards/mrm8488/t5-base-finetuned-wikiSQL/README.md
+++ b/model_cards/mrm8488/t5-base-finetuned-wikiSQL/README.md
@@ -50,7 +50,7 @@ tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")
 model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")
 
 def get_sql(query):
-  input_text = "translante English to SQL: %s </s>" % query
+  input_text = "translate English to SQL: %s </s>" % query
   features = tokenizer([input_text], return_tensors='pt')
 
   output = model.generate(input_ids=features['input_ids'], 
diff --git a/model_cards/mrm8488/t5-small-finetuned-wikiSQL/README.md b/model_cards/mrm8488/t5-small-finetuned-wikiSQL/README.md
index 147e1a6b2a..ebdab18dd6 100644
--- a/model_cards/mrm8488/t5-small-finetuned-wikiSQL/README.md
+++ b/model_cards/mrm8488/t5-small-finetuned-wikiSQL/README.md
@@ -50,7 +50,7 @@ tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-small-finetuned-wikiSQL")
 model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-small-finetuned-wikiSQL")
 
 def get_sql(query):
-  input_text = "translante English to SQL: %s </s>" % query
+  input_text = "translate English to SQL: %s </s>" % query
   features = tokenizer([input_text], return_tensors='pt')
 
   output = model.generate(input_ids=features['input_ids'], 
diff --git a/model_cards/mrm8488/xlm-multi-finetuned-xquadv1/README.md b/model_cards/mrm8488/xlm-multi-finetuned-xquadv1/README.md
index 629c945a29..72ba3320b3 100644
--- a/model_cards/mrm8488/xlm-multi-finetuned-xquadv1/README.md
+++ b/model_cards/mrm8488/xlm-multi-finetuned-xquadv1/README.md
@@ -71,7 +71,7 @@ Citation:
 
 </details>
 
-As XQuAD is just an evaluation dataset, I used Data augmentation techniques (scraping, neural machine translation, etc) to obtain more samples and splited the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
+As XQuAD is just an evaluation dataset, I used Data augmentation techniques (scraping, neural machine translation, etc) to obtain more samples and split the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
 
 | Dataset     | # samples |
 | ----------- | --------- |
diff --git a/src/transformers/benchmark/benchmark_utils.py b/src/transformers/benchmark/benchmark_utils.py
index 908c511298..7a9f538eeb 100644
--- a/src/transformers/benchmark/benchmark_utils.py
+++ b/src/transformers/benchmark/benchmark_utils.py
@@ -172,7 +172,7 @@ class MemorySummary(NamedTuple):
     `MemorySummary` namedtuple otherwise with the fields:
 
         - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by
-          substracting the memory after executing each line from the memory before executing said line.
+          subtracting the memory after executing each line from the memory before executing said line.
         - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
           obtained by summing repeated memory increase for a line if it's executed several times. The list is sorted
           from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory
@@ -208,7 +208,7 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_i
 
     Returns:
 
-        - `max_memory`: (`int`) cosumed memory peak in Bytes
+        - `max_memory`: (`int`) consumed memory peak in Bytes
     """
 
     def get_cpu_memory(process_id: int) -> int:
@@ -221,7 +221,7 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_i
 
         Returns
 
-            - `memory`: (`int`) cosumed memory in Bytes
+            - `memory`: (`int`) consumed memory in Bytes
         """
         process = psutil.Process(process_id)
         try:
@@ -367,7 +367,7 @@ def start_memory_tracing(
             devices = list(range(nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
             nvml.nvmlShutdown()
         except (OSError, nvml.NVMLError):
-            logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.")
+            logger.warning("Error while initializing communication with GPU. " "We won't perform GPU memory tracing.")
             log_gpu = False
         else:
             log_gpu = is_torch_available() or is_tf_available()
@@ -472,9 +472,10 @@ def stop_memory_tracing(
 
     Args:
 
-        - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
-        - `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total
-          memory
+        `memory_trace` (optional output of start_memory_tracing, default: None):
+            memory trace to convert into a summary
+        `ignore_released_memory` (boolean, default: None):
+            if True we only sum memory increase to compute total memory
 
     Return:
 
@@ -482,7 +483,7 @@ def stop_memory_tracing(
         - `MemorySummary` namedtuple otherwise with the fields:
 
             - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by
-              substracting the memory after executing each line from the memory before executing said line.
+              subtracting the memory after executing each line from the memory before executing said line.
             - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each
               line obtained by summing repeated memory increase for a line if it's executed several times. The list is
               sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative
diff --git a/src/transformers/commands/convert.py b/src/transformers/commands/convert.py
index 3dfa61de96..1e054b6a30 100644
--- a/src/transformers/commands/convert.py
+++ b/src/transformers/commands/convert.py
@@ -41,7 +41,7 @@ class ConvertCommand(BaseTransformersCLICommand):
             "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder."
         )
         train_parser.add_argument(
-            "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch savd model output."
+            "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch saved model output."
         )
         train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.")
         train_parser.add_argument(
diff --git a/src/transformers/configuration_bart.py b/src/transformers/configuration_bart.py
index 2fe8025438..a5f79f33d1 100644
--- a/src/transformers/configuration_bart.py
+++ b/src/transformers/configuration_bart.py
@@ -61,7 +61,7 @@ class BartConfig(PretrainedConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_bert.py b/src/transformers/configuration_bert.py
index 78f7621ccb..8c9ec766d1 100644
--- a/src/transformers/configuration_bert.py
+++ b/src/transformers/configuration_bert.py
@@ -76,7 +76,7 @@ class BertConfig(PretrainedConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
diff --git a/src/transformers/configuration_bert_generation.py b/src/transformers/configuration_bert_generation.py
index b41c8ca24f..0342d4909c 100644
--- a/src/transformers/configuration_bert_generation.py
+++ b/src/transformers/configuration_bert_generation.py
@@ -42,7 +42,7 @@ class BertGenerationConfig(PretrainedConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
@@ -62,7 +62,7 @@ class BertGenerationConfig(PretrainedConfig):
         >>> # Initializing a BertGeneration config
         >>> configuration = BertGenerationConfig()
 
-        >>> # Initializing a modelfrom the config
+        >>> # Initializing a model from the config
         >>> model = BertGenerationEncoder(configuration)
 
         >>> # Accessing the model configuration
diff --git a/src/transformers/configuration_blenderbot.py b/src/transformers/configuration_blenderbot.py
index c11150bede..ef9b97db5a 100644
--- a/src/transformers/configuration_blenderbot.py
+++ b/src/transformers/configuration_blenderbot.py
@@ -58,7 +58,7 @@ class BlenderbotConfig(BartConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_deberta.py b/src/transformers/configuration_deberta.py
index 25181ea515..e305784e84 100644
--- a/src/transformers/configuration_deberta.py
+++ b/src/transformers/configuration_deberta.py
@@ -55,7 +55,7 @@ class DebertaConfig(PretrainedConfig):
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
             :obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
diff --git a/src/transformers/configuration_distilbert.py b/src/transformers/configuration_distilbert.py
index 256994a4df..42a6eae22e 100644
--- a/src/transformers/configuration_distilbert.py
+++ b/src/transformers/configuration_distilbert.py
@@ -61,7 +61,7 @@ class DistilBertConfig(PretrainedConfig):
         hidden_dim (:obj:`int`, `optional`, defaults to 3072):
             The size of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
diff --git a/src/transformers/configuration_dpr.py b/src/transformers/configuration_dpr.py
index a3b3c85b76..b079e8a7d6 100644
--- a/src/transformers/configuration_dpr.py
+++ b/src/transformers/configuration_dpr.py
@@ -57,7 +57,7 @@ class DPRConfig(PretrainedConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
diff --git a/src/transformers/configuration_electra.py b/src/transformers/configuration_electra.py
index 00bf7a909b..91253f0aef 100644
--- a/src/transformers/configuration_electra.py
+++ b/src/transformers/configuration_electra.py
@@ -62,7 +62,7 @@ class ElectraConfig(PretrainedConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
diff --git a/src/transformers/configuration_flaubert.py b/src/transformers/configuration_flaubert.py
index 81202cd627..64c02c21a8 100644
--- a/src/transformers/configuration_flaubert.py
+++ b/src/transformers/configuration_flaubert.py
@@ -59,11 +59,11 @@ class FlaubertConfig(XLMConfig):
         attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for the attention mechanism
         gelu_activation (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to use a `gelu` actibation instead of `relu`.
+            Whether or not to use a `gelu` activation instead of `relu`.
         sinusoidal_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.
         causal (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not the model shoul behave in a causal manner. Causal models use a triangular attention mask in
+            Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in
             order to only attend to the left-side context instead if a bidirectional context.
         asm (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction
diff --git a/src/transformers/configuration_fsmt.py b/src/transformers/configuration_fsmt.py
index 4480826efb..4008d93fb1 100644
--- a/src/transformers/configuration_fsmt.py
+++ b/src/transformers/configuration_fsmt.py
@@ -73,7 +73,7 @@ class FSMTConfig(PretrainedConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_funnel.py b/src/transformers/configuration_funnel.py
index 2ba23ec57f..c1b6a284af 100644
--- a/src/transformers/configuration_funnel.py
+++ b/src/transformers/configuration_funnel.py
@@ -68,7 +68,7 @@ class FunnelConfig(PretrainedConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for the attention probabilities.
         activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_layoutlm.py b/src/transformers/configuration_layoutlm.py
index 0e1afed595..75e5fe717c 100644
--- a/src/transformers/configuration_layoutlm.py
+++ b/src/transformers/configuration_layoutlm.py
@@ -54,7 +54,7 @@ class LayoutLMConfig(BertConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
diff --git a/src/transformers/configuration_lxmert.py b/src/transformers/configuration_lxmert.py
index 18b7bb862d..0c06d14ebd 100644
--- a/src/transformers/configuration_lxmert.py
+++ b/src/transformers/configuration_lxmert.py
@@ -57,7 +57,7 @@ class LxmertConfig(PretrainedConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
@@ -95,10 +95,9 @@ class LxmertConfig(PretrainedConfig):
             Whether or not to add masked language modeling (as used in pretraining models such as BERT) to the loss
             objective.
         task_obj_predict (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to add object predicition, attribute predicition and feature regression to the loss
-            objective.
+            Whether or not to add object prediction, attribute prediction and feature regression to the loss objective.
         task_qa (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to add the question-asnwering loss to the objective
+            Whether or not to add the question-answering loss to the objective
         visual_obj_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether or not to calculate the object-prediction loss objective
         visual_attr_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
@@ -106,10 +105,10 @@ class LxmertConfig(PretrainedConfig):
         visual_feat_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether or not to calculate the feature-regression loss objective
         output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not the model should return the attentions from the vision, langauge, and cross-modality layers
+            Whether or not the model should return the attentions from the vision, language, and cross-modality layers
             should be returned.
         output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not the model should return the hidden states from the vision, langauge, and cross-modality
+            Whether or not the model should return the hidden states from the vision, language, and cross-modality
             layers should be returned.
     """
 
diff --git a/src/transformers/configuration_marian.py b/src/transformers/configuration_marian.py
index efeaca451d..8e4e257ce9 100644
--- a/src/transformers/configuration_marian.py
+++ b/src/transformers/configuration_marian.py
@@ -52,7 +52,7 @@ class MarianConfig(BartConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_mbart.py b/src/transformers/configuration_mbart.py
index f1af1d5d62..8406236889 100644
--- a/src/transformers/configuration_mbart.py
+++ b/src/transformers/configuration_mbart.py
@@ -57,7 +57,7 @@ class MBartConfig(BartConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_pegasus.py b/src/transformers/configuration_pegasus.py
index bc61e64891..8bda4dc114 100644
--- a/src/transformers/configuration_pegasus.py
+++ b/src/transformers/configuration_pegasus.py
@@ -96,7 +96,7 @@ class PegasusConfig(BartConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_prophetnet.py b/src/transformers/configuration_prophetnet.py
index 0fb77614a9..3dc2b011ea 100644
--- a/src/transformers/configuration_prophetnet.py
+++ b/src/transformers/configuration_prophetnet.py
@@ -60,7 +60,7 @@ class ProphetNetConfig(PretrainedConfig):
         attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
diff --git a/src/transformers/configuration_rag.py b/src/transformers/configuration_rag.py
index 22fe00006c..eaf353a213 100644
--- a/src/transformers/configuration_rag.py
+++ b/src/transformers/configuration_rag.py
@@ -30,7 +30,7 @@ RAG_CONFIG_DOC = r"""
             Separator inserted between the title and the text of the retrieved document when calling
             :class:`~transformers.RagRetriever`.
         doc_sep (:obj:`str`, `optional`, defaults to  ``" // "``):
-            Separator inserted between the the text of the retrieved document and the original input when calliang
+            Separator inserted between the text of the retrieved document and the original input when calling
             :class:`~transformers.RagRetriever`.
         n_docs (:obj:`int`, `optional`, defaults to 5):
             Number of documents to retrieve.
@@ -39,7 +39,7 @@ RAG_CONFIG_DOC = r"""
         retrieval_vector_size (:obj:`int`, `optional`, defaults to 768):
             Dimensionality of the document embeddings indexed by :class:`~transformers.RagRetriever`.
         retrieval_batch_size (:obj:`int`, `optional`, defaults to 8):
-            Retrieval batch size, defined as the number of queries issues concurrently to the faiss index excapsulated
+            Retrieval batch size, defined as the number of queries issued concurrently to the faiss index encapsulated
             :class:`~transformers.RagRetriever`.
         dataset (:obj:`str`, `optional`, defaults to :obj:`"wiki_dpr"`):
             A dataset identifier of the indexed dataset in HuggingFace Datasets (list all available datasets and ids
diff --git a/src/transformers/configuration_reformer.py b/src/transformers/configuration_reformer.py
index 2f018ebbac..0ef4b598b7 100755
--- a/src/transformers/configuration_reformer.py
+++ b/src/transformers/configuration_reformer.py
@@ -82,7 +82,7 @@ class ReformerConfig(PretrainedConfig):
             The non-linear activation function (function or string) in the feed forward layer in the residual attention
             block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.05):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         hidden_size (:obj:`int`, `optional`, defaults to 256):
             Dimensionality of the output hidden states of the residual attention blocks.
         initializer_range (:obj:`float`, `optional`, defaults to 0.02):
diff --git a/src/transformers/configuration_retribert.py b/src/transformers/configuration_retribert.py
index 0b902d6c16..36e04faa71 100644
--- a/src/transformers/configuration_retribert.py
+++ b/src/transformers/configuration_retribert.py
@@ -20,7 +20,7 @@ from .utils import logging
 
 logger = logging.get_logger(__name__)
 
-# TODO: uploadto AWS
+# TODO: upload to AWS
 RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     "retribert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
 }
@@ -51,7 +51,7 @@ class RetriBertConfig(PretrainedConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
diff --git a/src/transformers/configuration_squeezebert.py b/src/transformers/configuration_squeezebert.py
index b07d435aa3..666c79ab2f 100644
--- a/src/transformers/configuration_squeezebert.py
+++ b/src/transformers/configuration_squeezebert.py
@@ -52,7 +52,7 @@ class SqueezeBertConfig(PretrainedConfig):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
diff --git a/src/transformers/configuration_transfo_xl.py b/src/transformers/configuration_transfo_xl.py
index 603155c6db..ccd88ac44e 100644
--- a/src/transformers/configuration_transfo_xl.py
+++ b/src/transformers/configuration_transfo_xl.py
@@ -77,7 +77,7 @@ class TransfoXLConfig(PretrainedConfig):
         adaptive (:obj:`boolean`, `optional`, defaults to :obj:`True`):
             Whether or not to use adaptive softmax.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         dropatt (:obj:`float`, `optional`, defaults to 0):
             The dropout ratio for the attention probabilities.
         untie_r (:obj:`boolean`, `optional`, defaults to :obj:`True`):
diff --git a/src/transformers/convert_graph_to_onnx.py b/src/transformers/convert_graph_to_onnx.py
index 53eadf57f0..ee13ced6c5 100644
--- a/src/transformers/convert_graph_to_onnx.py
+++ b/src/transformers/convert_graph_to_onnx.py
@@ -83,7 +83,7 @@ def generate_identified_filename(filename: Path, identifier: str) -> Path:
         filename: pathlib.Path The actual path object we would like to add an identifier suffix
         identifier: The suffix to add
 
-    Returns: String with concatenated indentifier at the end of the filename
+    Returns: String with concatenated identifier at the end of the filename
     """
     return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix)
 
diff --git a/src/transformers/convert_longformer_original_pytorch_lightning_to_pytorch.py b/src/transformers/convert_longformer_original_pytorch_lightning_to_pytorch.py
index 248f2d1ed9..48337e9beb 100644
--- a/src/transformers/convert_longformer_original_pytorch_lightning_to_pytorch.py
+++ b/src/transformers/convert_longformer_original_pytorch_lightning_to_pytorch.py
@@ -30,7 +30,7 @@ class LightningModel(pl.LightningModule):
         self.num_labels = 2
         self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels)
 
-    # implement only because lighning requires to do so
+    # implement only because lightning requires to do so
     def forward(self):
         pass
 
@@ -57,7 +57,7 @@ def convert_longformer_qa_checkpoint_to_pytorch(
     # save model
     longformer_for_qa.save_pretrained(pytorch_dump_folder_path)
 
-    print("Conversion succesful. Model saved under {}".format(pytorch_dump_folder_path))
+    print("Conversion successful. Model saved under {}".format(pytorch_dump_folder_path))
 
 
 if __name__ == "__main__":
@@ -75,7 +75,7 @@ if __name__ == "__main__":
         default=None,
         type=str,
         required=True,
-        help="Path the official PyTorch Lighning Checkpoint.",
+        help="Path the official PyTorch Lightning Checkpoint.",
     )
     parser.add_argument(
         "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
diff --git a/src/transformers/convert_marian_tatoeba_to_pytorch.py b/src/transformers/convert_marian_tatoeba_to_pytorch.py
index d7f89e8184..6d7333aae7 100644
--- a/src/transformers/convert_marian_tatoeba_to_pytorch.py
+++ b/src/transformers/convert_marian_tatoeba_to_pytorch.py
@@ -34,7 +34,7 @@ class TatoebaConverter:
 
         1. convert numpy state dict to hf format (same code as OPUS-MT-Train conversion).
         2. rename opus model to huggingface format. This means replace each alpha3 code with an alpha2 code if a unique
-           one existes. e.g. aav-eng -> aav-en, heb-eng -> he-en
+           one exists. e.g. aav-eng -> aav-en, heb-eng -> he-en
         3. write a model card containing the original Tatoeba-Challenge/README.md and extra info about alpha3 group
            members.
     """
diff --git a/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py b/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py
index 104075bf4e..631d57df26 100755
--- a/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py
+++ b/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py
@@ -123,7 +123,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--force_download",
         action="store_true",
-        help="Re-dowload checkpoints.",
+        help="Re-download checkpoints.",
     )
     args = parser.parse_args()
 
diff --git a/src/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py b/src/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
index 5ad56c73b5..f726466b10 100755
--- a/src/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
+++ b/src/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
@@ -104,7 +104,7 @@ if __name__ == "__main__":
         "--finetuning_task",
         default=None,
         type=str,
-        help="Name of a task on which the XLNet TensorFloaw model was fine-tuned",
+        help="Name of a task on which the XLNet TensorFlow model was fine-tuned",
     )
     args = parser.parse_args()
     print(args)
diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py
index 4e9be990b2..c0a41ee838 100644
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -330,7 +330,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
         input_ids, labels, attention_mask = self.mask_tokens(input_ids)
 
         token_type_ids = [example["token_type_ids"] for example in examples]
-        # size of segment_ids varied because randomness, padding zero to the end as the orignal implementation
+        # size of segment_ids varied because randomness, padding zero to the end as the original implementation
         token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
 
         sop_label_list = [example["sentence_order_label"] for example in examples]
diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py
index 8aa72c955f..d88eacdf5d 100644
--- a/src/transformers/data/datasets/language_modeling.py
+++ b/src/transformers/data/datasets/language_modeling.py
@@ -71,7 +71,7 @@ class TextDataset(Dataset):
                         tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
                     )
                 # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
-                # If your dataset is small, first you should loook for a bigger one :-) and second you
+                # If your dataset is small, first you should look for a bigger one :-) and second you
                 # can change this behavior by adding (model specific) padding.
 
                 start = time.time()
diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py
index 41daa06e99..89ef2e22b6 100644
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -327,7 +327,7 @@ def squad_convert_examples_to_features(
         padding_strategy: Default to "max_length". Which padding strategy to use
         return_dataset: Default False. Either 'pt' or 'tf'.
             if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset
-        threads: multiple processing threadsa-smi
+        threads: multiple processing threads.
 
 
     Returns:
@@ -527,7 +527,7 @@ def squad_convert_examples_to_features(
 
 class SquadProcessor(DataProcessor):
     """
-    Processor for the SQuAD data set. Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and
+    Processor for the SQuAD data set. Overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and
     version 2.0 of SQuAD, respectively.
     """
 
diff --git a/src/transformers/data/processors/utils.py b/src/transformers/data/processors/utils.py
index d8c916fcec..0fb3f40b9c 100644
--- a/src/transformers/data/processors/utils.py
+++ b/src/transformers/data/processors/utils.py
@@ -245,9 +245,6 @@ class SingleSentenceClassificationProcessor(DataProcessor):
         Args:
             tokenizer: Instance of a tokenizer that will tokenize the examples
             max_length: Maximum example length
-            task: GLUE task
-            label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
-            output_mode: String indicating the output mode. Either ``regression`` or ``classification``
             pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
             pad_token: Padding token
             mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index a422c628af..d9f2ec0db6 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -89,7 +89,7 @@ try:
     # Check we're not importing a "datasets" directory somewhere
     _datasets_available = hasattr(datasets, "__version__") and hasattr(datasets, "load_dataset")
     if _datasets_available:
-        logger.debug(f"Succesfully imported datasets version {datasets.__version__}")
+        logger.debug(f"Successfully imported datasets version {datasets.__version__}")
     else:
         logger.debug("Imported a datasets object but this doesn't seem to be the 🤗 datasets library.")
 
@@ -147,7 +147,7 @@ try:
     import faiss  # noqa: F401
 
     _faiss_available = True
-    logger.debug(f"Succesfully imported faiss version {faiss.__version__}")
+    logger.debug(f"Successfully imported faiss version {faiss.__version__}")
 except ImportError:
     _faiss_available = False
 
@@ -290,7 +290,7 @@ def torch_only_method(fn):
 
 # docstyle-ignore
 DATASETS_IMPORT_ERROR = """
-{0} requires the 🤗 Datasets library but it was not found in your enviromnent. You can install it with:
+{0} requires the 🤗 Datasets library but it was not found in your environment. You can install it with:
 ```
 pip install datasets
 ```
@@ -308,7 +308,7 @@ that python file if that's the case.
 
 # docstyle-ignore
 TOKENIZERS_IMPORT_ERROR = """
-{0} requires the 🤗 Tokenizers library but it was not found in your enviromnent. You can install it with:
+{0} requires the 🤗 Tokenizers library but it was not found in your environment. You can install it with:
 ```
 pip install tokenizers
 ```
@@ -321,30 +321,30 @@ In a notebook or a colab, you can install it by executing a cell with
 
 # docstyle-ignore
 SENTENCEPIECE_IMPORT_ERROR = """
-{0} requires the SentencePiece library but it was not found in your enviromnent. Checkout the instructions on the
+{0} requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
 installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
-that match your enviromnent.
+that match your environment.
 """
 
 
 # docstyle-ignore
 FAISS_IMPORT_ERROR = """
-{0} requires the faiss library but it was not found in your enviromnent. Checkout the instructions on the
+{0} requires the faiss library but it was not found in your environment. Checkout the instructions on the
 installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
-that match your enviromnent.
+that match your environment.
 """
 
 
 # docstyle-ignore
 PYTORCH_IMPORT_ERROR = """
-{0} requires the PyTorch library but it was not found in your enviromnent. Checkout the instructions on the
-installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your enviromnent.
+{0} requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
+installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
 """
 
 
 # docstyle-ignore
 SKLEARN_IMPORT_ERROR = """
-{0} requires the scikit-learn library but it was not found in your enviromnent. You can install it with:
+{0} requires the scikit-learn library but it was not found in your environment. You can install it with:
 ```
 pip install -U scikit-learn
 ```
@@ -357,15 +357,15 @@ In a notebook or a colab, you can install it by executing a cell with
 
 # docstyle-ignore
 TENSORFLOW_IMPORT_ERROR = """
-{0} requires the TensorFlow library but it was not found in your enviromnent. Checkout the instructions on the
-installation page: https://www.tensorflow.org/install and follow the ones that match your enviromnent.
+{0} requires the TensorFlow library but it was not found in your environment. Checkout the instructions on the
+installation page: https://www.tensorflow.org/install and follow the ones that match your environment.
 """
 
 
 # docstyle-ignore
 FLAX_IMPORT_ERROR = """
-{0} requires the FLAX library but it was not found in your enviromnent. Checkout the instructions on the
-installation page: https://github.com/google/flax and follow the ones that match your enviromnent.
+{0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the
+installation page: https://github.com/google/flax and follow the ones that match your environment.
 """
 
 
@@ -918,13 +918,13 @@ def cached_path(
 
     Args:
         cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
-        force_download: if True, re-dowload the file even if it's already cached in the cache dir.
-        resume_download: if True, resume the download if incompletly recieved file is found.
+        force_download: if True, re-download the file even if it's already cached in the cache dir.
+        resume_download: if True, resume the download if incompletely received file is found.
         user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
         extract_compressed_file: if True and the path point to a zip or tar file, extract the compressed
             file in a folder along the archive.
         force_extract: if True when extract_compressed_file is True and the archive was already extracted,
-            re-extract the archive and overide the folder where it was extracted.
+            re-extract the archive and override the folder where it was extracted.
 
     Return:
         None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string)
diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py
index 509d60e17d..71d8c6deb9 100644
--- a/src/transformers/generation_tf_utils.py
+++ b/src/transformers/generation_tf_utils.py
@@ -25,14 +25,14 @@ logger = logging.get_logger(__name__)
 
 class TFGenerationMixin:
     """
-    A class contraining all of the functions supporting generation, to be used as a mixin in
-    :class:`~transfomers.TFPreTrainedModel`.
+    A class containing all of the functions supporting generation, to be used as a mixin in
+    :class:`~transformers.TFPreTrainedModel`.
     """
 
     def prepare_inputs_for_generation(self, inputs, **kwargs):
         """
-        Implement in subclasses of :class:`~transfomers.TFPreTrainedModel` for custom behavior to prepare inputs in the
-        generate method.
+        Implement in subclasses of :class:`~transformers.TFPreTrainedModel` for custom behavior to prepare inputs in
+        the generate method.
         """
         return {"inputs": inputs}
 
@@ -216,17 +216,17 @@ class TFGenerationMixin:
         )
 
         if input_ids is not None:
-            batch_size = shape_list(input_ids)[0]  # overriden by the input batch_size
+            batch_size = shape_list(input_ids)[0]  # overridden by the input batch_size
         else:
             batch_size = 1
 
-        assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer."
+        assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
         assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer."
         assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
         assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
         assert isinstance(use_cache, bool), "`use_cache` should be a boolean."
         assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
-        assert temperature > 0, "`temperature` should be strictely positive."
+        assert temperature > 0, "`temperature` should be strictly positive."
         assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
         assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
         assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
@@ -239,10 +239,10 @@ class TFGenerationMixin:
         assert (eos_token_id is None) or (
             isinstance(eos_token_id, int) and (eos_token_id >= 0)
         ), "`eos_token_id` should be a positive integer."
-        assert length_penalty > 0, "`length_penalty` should be strictely positive."
+        assert length_penalty > 0, "`length_penalty` should be strictly positive."
         assert (
             isinstance(num_return_sequences, int) and num_return_sequences > 0
-        ), "`num_return_sequences` should be a strictely positive integer."
+        ), "`num_return_sequences` should be a strictly positive integer."
         assert (
             bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list)
         ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated"
@@ -722,7 +722,7 @@ class TFGenerationMixin:
                     beam_scores[:, None], (batch_size * num_beams, vocab_size)
                 )  # (batch_size * num_beams, vocab_size)
 
-                # re-organize to group the beam together (we are keeping top hypothesis accross beams)
+                # re-organize to group the beam together (we are keeping top hypothesis across beams)
                 next_scores = tf.reshape(
                     next_scores, (batch_size, num_beams * vocab_size)
                 )  # (batch_size, num_beams * vocab_size)
@@ -897,7 +897,7 @@ class TFGenerationMixin:
 
     def adjust_logits_during_generation(self, logits, **kwargs):
         """
-        Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to adjust the logits in
+        Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in
         the generate method.
         """
         return logits
@@ -978,7 +978,7 @@ def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids):
 
 def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
     """
-    Filter a distribution of logits using top-k and/or nucleus (top-p) filterin
+    Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
 
     Args:
         logits: logits distribution shape (batch size, vocabulary size)
@@ -1047,7 +1047,7 @@ def set_tensor_by_indices_to_value(tensor, indices, value):
 
 def sample_without_replacement(logits, num_samples):
     """
-    categorical sampling witouth replacement is currently not implemented the gumbel-max trick will do for now see
+    categorical sampling without replacement is currently not implemented. The gumbel-max trick will do for now, see
     https://github.com/tensorflow/tensorflow/issues/9260 for more info
     """
     z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1))
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 6e06a6fdf8..e85166a815 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -29,20 +29,20 @@ logger = logging.get_logger(__name__)
 
 class GenerationMixin:
     """
-    A class contraining all of the functions supporting generation, to be used as a mixin in
-    :class:`~transfomers.PreTrainedModel`.
+    A class containing all of the functions supporting generation, to be used as a mixin in
+    :class:`~transformers.PreTrainedModel`.
     """
 
     def prepare_inputs_for_generation(self, input_ids, **kwargs):
         """
-        Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to prepare inputs in the
+        Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to prepare inputs in the
         generate method.
         """
         return {"input_ids": input_ids}
 
     def adjust_logits_during_generation(self, logits, **kwargs):
         """
-        Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to adjust the logits in
+        Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in
         the generate method.
         """
         return logits
@@ -285,7 +285,7 @@ class GenerationMixin:
         )
 
         if input_ids is not None:
-            batch_size = input_ids.shape[0]  # overriden by the input batch_size
+            batch_size = input_ids.shape[0]  # overridden by the input batch_size
         else:
             batch_size = 1
 
@@ -533,7 +533,7 @@ class GenerationMixin:
     ):
         """
         Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated
-        independantly.
+        independently.
         """
         # length of generated sentences / unfinished sentences
         unfinished_sents = input_ids.new(batch_size).fill_(1)
@@ -600,7 +600,7 @@ class GenerationMixin:
                 # unfinished_sents is set to zero if eos in sentence
                 unfinished_sents.mul_((~eos_in_sents).long())
 
-            # stop when there is a </s> in each sentence, or if we exceed the maximul length
+            # stop when there is a </s> in each sentence, or if we exceed the maximum length
             if unfinished_sents.max() == 0:
                 break
 
@@ -724,7 +724,7 @@ class GenerationMixin:
             else:
                 next_scores = scores + beam_scores[:, None].expand_as(scores)  # (batch_size * num_beams, vocab_size)
 
-                # re-organize to group the beam together (we are keeping top hypothesis accross beams)
+                # re-organize to group the beam together (we are keeping top hypothesis across beams)
                 next_scores = next_scores.view(
                     batch_size, num_beams * vocab_size
                 )  # (batch_size, num_beams * vocab_size)
@@ -969,7 +969,7 @@ def top_k_top_p_filtering(
     min_tokens_to_keep: int = 1,
 ) -> Tensor:
     """
-    Filter a distribution of logits using top-k and/or nucleus (top-p) filterin
+    Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
 
     Args:
         logits: logits distribution shape (batch size, vocabulary size)
diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py
index 726b910d3b..da3f88f457 100644
--- a/src/transformers/modelcard.py
+++ b/src/transformers/modelcard.py
@@ -49,7 +49,7 @@ class ModelCard:
     """
 
     def __init__(self, **kwargs):
-        # Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers)
+        # Recommended attributes from https://arxiv.org/abs/1810.03993 (see papers)
         self.model_details = kwargs.pop("model_details", {})
         self.intended_use = kwargs.pop("intended_use", {})
         self.factors = kwargs.pop("factors", {})
diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py
index 267ba9927d..fad40db2fa 100644
--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -488,7 +488,7 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
             model_args (additional positional arguments, `optional`):
                 Will be passed along to the underlying model ``__init__()`` method.
             config (:class:`~transformers.PretrainedConfig`, `optional`):
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can
+                Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                 be automatically loaded when:
 
                     - The model is a model provided by the library (loaded with the `shortcut name` string of a
@@ -522,7 +522,7 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
             output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
             local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not to only look at local files (e.g., not try doanloading the model).
+                Whether or not to only look at local files (e.g., not try downloading the model).
             use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
                 Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
                 our S3 (faster). Should be set to :obj:`False` for checkpoints larger than 20GB.
@@ -1424,7 +1424,7 @@ class AutoModelForTokenClassification:
 class AutoModelForMultipleChoice:
     r"""
     This is a generic model class that will be instantiated as one of the model classes of the library---with a
-    multiple choice classifcation head---when created with the when created with the
+    multiple choice classification head---when created with the
     :meth:`~transformers.AutoModelForMultipleChoice.from_pretrained` class method or the
     :meth:`~transformers.AutoModelForMultipleChoice.from_config` class method.
 
diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 08c95fb6bf..5ed10043f1 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -906,7 +906,7 @@ class BartModel(PretrainedBartModel):
                 output_hidden_states=output_hidden_states,
                 return_dict=return_dict,
             )
-        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOuput when return_dict=False
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False
         elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
             encoder_outputs = BaseModelOutput(
                 last_hidden_state=encoder_outputs[0],
diff --git a/src/transformers/modeling_deberta.py b/src/transformers/modeling_deberta.py
index c0da8073f8..a05b65e395 100644
--- a/src/transformers/modeling_deberta.py
+++ b/src/transformers/modeling_deberta.py
@@ -69,8 +69,8 @@ class XSoftmax(torch.autograd.Function):
 
     Args:
       input (:obj:`torch.tensor`): The input tensor that will apply softmax.
-      mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax caculation.
-      dim (int): The dimenssion that will apply softmax
+      mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
+      dim (int): The dimension that will apply softmax
 
     Example::
       import torch
@@ -540,16 +540,16 @@ class DisentangledSelfAttention(torch.nn.Module):
 
         Args:
             hidden_states (:obj:`torch.FloatTensor`):
-                Input states to the module usally the output from previous layer, it will be the Q,K and V in
+                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                 `Attention(Q,K,V)`
 
             attention_mask (:obj:`torch.ByteTensor`):
-                An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maxium
+                An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maximum
                 sequence length in which element [i,j] = `1` means the `i` th token in the input can attend to the `j`
                 th token.
 
             return_att (:obj:`bool`, optional):
-                Whether return the attention maxitrix.
+                Whether return the attention matrix.
 
             query_states (:obj:`torch.FloatTensor`, optional):
                 The `Q` state in `Attention(Q,K,V)`.
@@ -627,7 +627,7 @@ class DisentangledSelfAttention(torch.nn.Module):
             relative_pos = relative_pos.unsqueeze(1)
         # bxhxqxk
         elif relative_pos.dim() != 4:
-            raise ValueError(f"Relative postion ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
+            raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
 
         att_span = min(max(query_layer.size(-2), key_layer.size(-2)), self.max_relative_positions)
         relative_pos = relative_pos.long().to(query_layer.device)
@@ -772,7 +772,7 @@ DEBERTA_START_DOCSTRING = r"""
     The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
     <https://arxiv.org/abs/2006.03654>`_ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
     BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
-    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pre-trianing data.
+    improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB pre-training data.
 
     This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
     subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py
index d8ee191a09..20837a938d 100755
--- a/src/transformers/modeling_distilbert.py
+++ b/src/transformers/modeling_distilbert.py
@@ -290,7 +290,7 @@ class Transformer(nn.Module):
             attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence.
 
         Returns:
-            hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hiddens states in the last (top)
+            hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top)
             layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
                 Tuple of length n_layers with the hidden states from each layer.
                 Optional: only if output_hidden_states=True
diff --git a/src/transformers/modeling_dpr.py b/src/transformers/modeling_dpr.py
index 0de028d303..9f365304a4 100644
--- a/src/transformers/modeling_dpr.py
+++ b/src/transformers/modeling_dpr.py
@@ -418,7 +418,7 @@ DPR_READER_INPUTS_DOCSTRING = r"""
             Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
             tensors for more detail.
         output_hidden_states (:obj:`bool`, `optional`):
-            Whether or not to rturn the hidden states of all layers. See ``hidden_states`` under returned tensors for
+            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
             more detail.
         return_dict (:obj:`bool`, `optional`):
             Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
diff --git a/src/transformers/modeling_encoder_decoder.py b/src/transformers/modeling_encoder_decoder.py
index 6d6e3f788b..8efd43f555 100644
--- a/src/transformers/modeling_encoder_decoder.py
+++ b/src/transformers/modeling_encoder_decoder.py
@@ -30,7 +30,7 @@ logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "EncoderDecoderConfig"
 
 ENCODER_DECODER_START_DOCSTRING = r"""
-    This class can be used to inialize a sequence-to-sequnece model with any pretrained autoencoding model as the
+    This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
     encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
     :meth:`~transformers.AutoModel.from_pretrained` function and the decoder is loaded via
     :meth:`~transformers.AutoModelForCausalLM.from_pretrained` function. Cross-attention layers are automatically added
diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py
index b8d23cab53..4b90bbc231 100644
--- a/src/transformers/modeling_flaubert.py
+++ b/src/transformers/modeling_flaubert.py
@@ -99,7 +99,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
             `What are position IDs? <../glossary.html#position-ids>`_
         lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Length of each sentence that can be used to avoid performing attention on padding token indices. You can
-            also use :obj:`attention_mask` for the same result (see above), kept here for compatbility. Indices
+            also use :obj:`attention_mask` for the same result (see above), kept here for compatibility. Indices
             selected in ``[0, ..., input_ids.size(-1)]``:
         cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`):
             Dictionary strings to ``torch.FloatTensor`` that contains precomputed hidden-states (key and values in the
diff --git a/src/transformers/modeling_flax_auto.py b/src/transformers/modeling_flax_auto.py
index 0d0f9fc503..8a4be34732 100644
--- a/src/transformers/modeling_flax_auto.py
+++ b/src/transformers/modeling_flax_auto.py
@@ -124,18 +124,18 @@ class FlaxAutoModel(object):
                 All remaining positional arguments will be passed to the underlying model's ``__init__`` method
 
             config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can
+                Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                 be automatically loaded when:
 
                 - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a
                   pretrained model), or
                 - the model was saved using :func:`~transformers.FlaxPreTrainedModel.save_pretrained` and is reloaded
-                  by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a
+                  by supplying the save directory.
+                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
                   configuration JSON file named `config.json` is found in the directory.
 
             state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved
+                an optional state dictionary for the model to use instead of a state dictionary loaded from saved
                 weights file. This option can be used if you want to create a model from a pretrained configuration but
                 load your own weights. In this case though, you should check if using
                 :func:`~transformers.FlaxPreTrainedModel.save_pretrained` and
@@ -150,14 +150,14 @@ class FlaxAutoModel(object):
                 they exists.
 
             resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
+                Do not delete incompletely received file. Attempt to resume the download if such a file exists.
 
             proxies: (`optional`) dict, default None:
                 A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128',
                 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request.
 
             output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error
+                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error
                 messages.
 
             kwargs: (`optional`) Remaining dictionary of keyword arguments:
diff --git a/src/transformers/modeling_flax_roberta.py b/src/transformers/modeling_flax_roberta.py
index 48c5e0281c..eea705f3cd 100644
--- a/src/transformers/modeling_flax_roberta.py
+++ b/src/transformers/modeling_flax_roberta.py
@@ -64,7 +64,7 @@ ROBERTA_INPUTS_DOCSTRING = r"""
             Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
 
             - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **maked**.
+            - 0 for tokens that are **masked**.
 
             `What are attention masks? <../glossary.html#attention-mask>`__
         token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
diff --git a/src/transformers/modeling_funnel.py b/src/transformers/modeling_funnel.py
index a96b62ba04..3cb1a5eeaa 100644
--- a/src/transformers/modeling_funnel.py
+++ b/src/transformers/modeling_funnel.py
@@ -226,7 +226,7 @@ class FunnelAttentionStructure(nn.Module):
         d_model = self.config.d_model
         if self.config.attention_type == "factorized":
             # Notations from the paper, appending A.2.2, final formula.
-            # We need to create and return the matrics phi, psi, pi and omega.
+            # We need to create and return the matrices phi, psi, pi and omega.
             pos_seq = torch.arange(0, seq_len, 1.0, dtype=dtype, device=device)
             freq_seq = torch.arange(0, d_model // 2, 1.0, dtype=dtype, device=device)
             inv_freq = 1 / (10000 ** (freq_seq / (d_model // 2)))
@@ -1226,7 +1226,7 @@ class FunnelForMaskedLM(FunnelPreTrainedModel):
 
 @add_start_docstrings(
     """
-    Funnel Transfprmer Model with a sequence classification/regression head on top (two linear layer on top of the
+    Funnel Transformer Model with a sequence classification/regression head on top (two linear layer on top of the
     first timestep of the last hidden state) e.g. for GLUE tasks.
     """,
     FUNNEL_START_DOCSTRING,
diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py
index 22c7bd5919..030ac24edb 100644
--- a/src/transformers/modeling_gpt2.py
+++ b/src/transformers/modeling_gpt2.py
@@ -588,7 +588,7 @@ class GPT2Model(GPT2PreTrainedModel):
             attention_mask = (1.0 - attention_mask) * -10000.0
 
         # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if self.config.add_cross_attention and encoder_hidden_states is not None:
             encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
@@ -708,7 +708,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         position_ids = kwargs.get("position_ids", None)
 
         if attention_mask is not None and position_ids is None:
-            # create postion_ids on the fly for batch generation
+            # create position_ids on the fly for batch generation
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
             if past:
@@ -1050,7 +1050,7 @@ class GPT2ForSequenceClassification(GPT2PreTrainedModel):
                 sequence_lengths = -1
                 logger.warning(
                     f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
-                    f"unexpected if using padding tokens in conjuction with `inputs_embeds.`"
+                    f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                 )
 
         pooled_logits = logits[range(batch_size), sequence_lengths]
diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py
index 2a2f7e1634..f32d0a2f1d 100755
--- a/src/transformers/modeling_longformer.py
+++ b/src/transformers/modeling_longformer.py
@@ -382,7 +382,7 @@ class LongformerSelfAttention(nn.Module):
                 # batch_size x num_heads x max_num_global_attention_tokens x sequence_length
                 # which is the attention weights from tokens with global attention to all tokens
                 # It doesn't not return local attention
-                # In case of variable number of global attantion in the rows of a batch,
+                # In case of variable number of global attention in the rows of a batch,
                 # attn_probs are padded with -10000.0 attention scores
                 attn_probs = attn_probs.view(batch_size, self.num_heads, max_num_global_attn_indices, seq_len)
             else:
@@ -416,7 +416,7 @@ class LongformerSelfAttention(nn.Module):
                                        -0.7584,  0.4206, -0.0405,  0.1599,
                                        2.0514, -1.1600,  0.5372,  0.2629 ]
               window_overlap = num_rows = 4
-             (pad & diagonilize) =>
+             (pad & diagonalize) =>
              [ 0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000
                0.0000,  -1.8348,  0.7672,  0.2986,  0.0285, 0.0000,  0.0000
                0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599, 0.0000
@@ -440,7 +440,7 @@ class LongformerSelfAttention(nn.Module):
 
     @staticmethod
     def _chunk(hidden_states, window_overlap):
-        """convert into overlapping chunkings. Chunk size = 2w, overlap size = w"""
+        """convert into overlapping chunks. Chunk size = 2w, overlap size = w"""
 
         # non-overlapping chunks of size = 2w
         hidden_states = hidden_states.view(
@@ -491,7 +491,7 @@ class LongformerSelfAttention(nn.Module):
         chunked_query = self._chunk(query, window_overlap)
         chunked_key = self._chunk(key, window_overlap)
 
-        # matrix multipication
+        # matrix multiplication
         # bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
         # bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim
         # bcxy: batch_size * num_heads x chunks x 2window_overlap x window_overlap
@@ -1030,7 +1030,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
 
             `What are attention masks? <../glossary.html#attention-mask>`__
         global_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
-            Mask to decide the attention given on each token, local attention or global attenion. Tokens with global
+            Mask to decide the attention given on each token, local attention or global attention. Tokens with global
             attention attends to all other tokens, and all other tokens attend to them. This is important for
             task-specific finetuning because it makes the model more flexible at representing the task. For example,
             for classification, the <s> token should be given global attention. For QA, all question tokens should also
diff --git a/src/transformers/modeling_lxmert.py b/src/transformers/modeling_lxmert.py
index e917c916f4..8dd58cb9ae 100644
--- a/src/transformers/modeling_lxmert.py
+++ b/src/transformers/modeling_lxmert.py
@@ -58,7 +58,7 @@ class GeLU(nn.Module):
 @dataclass
 class LxmertModelOutput(ModelOutput):
     """
-    Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilites for the language,
+    Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
     visual, and, cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship"
     encoder")
 
@@ -405,7 +405,7 @@ class LxmertSelfAttentionLayer(nn.Module):
         self.output = LxmertAttentionOutput(config)
 
     def forward(self, input_tensor, attention_mask, output_attentions=False):
-        # Self attention attends to itself, thus keys and querys are the same (input_tensor).
+        # Self attention attends to itself, thus keys and queries are the same (input_tensor).
         output = self.self(
             input_tensor,
             input_tensor,
@@ -799,7 +799,7 @@ LXMERT_START_DOCSTRING = r"""
     <https://arxiv.org/abs/1908.07490>`__ by Hao Tan and Mohit Bansal. It's a vision and language transformer model,
     pretrained on a variety of multi-modal datasets comprising of GQA, VQAv2.0, MCSCOCO captions, and Visual genome,
     using a combination of masked language modeling, region of interest feature regression, cross entropy loss for
-    question answering attribute prediction, and object tag predicition.
+    question answering attribute prediction, and object tag prediction.
 
     This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
     methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
@@ -1076,12 +1076,10 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
         will add newly initialized weights. Reducing the size will remove weights from the end
 
         Args:
-            cur_qa_logit_layer (:obj:`torch.nn.Linear`):
-                Old linear layer to be resized.
             num_labels (:obj:`int`, `optional`):
                 New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
                 weights at the end. Reducing the size will remove weights from the end. If not provided or :obj:`None`,
-                just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model wihtout doing
+                just returns a pointer to the qa labels :obj:`torch.nn.Linear` module of the model without doing
                 anything.
 
         Return:
@@ -1298,12 +1296,10 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
         will add newly initialized weights. Reducing the size will remove weights from the end
 
         Args:
-            cur_qa_logit_layer (:obj:`torch.nn.Linear`):
-                Old linear layer to be resized.
             num_labels (:obj:`int`, `optional`):
                 New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
                 weights at the end. Reducing the size will remove weights from the end. If not provided or :obj:`None`,
-                just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model wihtout doing
+                just returns a pointer to the qa labels :obj:`torch.nn.Linear` module of the model without doing
                 anything.
 
         Return:
diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py
index 8d744b4258..e7aa16a1ce 100644
--- a/src/transformers/modeling_mobilebert.py
+++ b/src/transformers/modeling_mobilebert.py
@@ -887,7 +887,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
         )
 
         # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if self.config.is_decoder and encoder_hidden_states is not None:
             encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
diff --git a/src/transformers/modeling_rag.py b/src/transformers/modeling_rag.py
index c5809e1436..a203511dcf 100644
--- a/src/transformers/modeling_rag.py
+++ b/src/transformers/modeling_rag.py
@@ -40,7 +40,7 @@ class RetrievAugLMMarginOutput(ModelOutput):
 
     Args:
         loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Languaged modeling loss.
+            Language modeling loss.
         logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
             Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
             each vocabulary token.
@@ -413,7 +413,7 @@ RAG_FORWARD_INPUTS_DOCSTRING = r"""
 
             Used by the (:class:`~transformers.RagModel`) model during decoding.
         decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
-            Provide for generation tasks. `None` by default, constuct as per instructions for the generator model
+            Provide for generation tasks. `None` by default, construct as per instructions for the generator model
             you're using with your RAG instance.
         decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size,  target_sequence_length)`, `optional`):
             Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
@@ -424,7 +424,7 @@ RAG_FORWARD_INPUTS_DOCSTRING = r"""
             :obj:`past_key_values` are used in the (:class:`~transformers.RagTokenForGeneration`) model during
             decoding.
         doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
-            Score between each retrieved document embeddigs (see :obj:`retrieved_doc_embeds`) and
+            Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
             :obj:`question_encoder_last_hidden_state`. If the model has is not initialized with a ``retriever``
             :obj:`doc_scores` has to be provided to the forward pass. :obj:`doc_scores` can be computed via
             :obj:`question_encoder_last_hidden_state` and :obj:`retrieved_doc_embeds`, see examples for more
@@ -660,7 +660,7 @@ class RagModel(RagPreTrainedModel):
 
 @add_start_docstrings_to_model_forward(
     """
-    A RAG-sequence model impementation. It performs RAG-sequence specific marginalization in the forward pass.
+    A RAG-sequence model implementation. It performs RAG-sequence specific marginalization in the forward pass.
     """,
     RAG_START_DOCSTRING,
 )
@@ -736,7 +736,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
             >>> input_ids = input_dict["input_ids"]
             >>> outputs = model(input_ids=input_ids, labels=input_dict["labels"])
 
-            >>> # or use retriever seperately
+            >>> # or use retriever separately
             >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
             >>> # 1. Encode
             >>> question_hidden_states = model.question_encoder(input_ids)[0]
@@ -940,13 +940,13 @@ class RagSequenceForGeneration(RagPreTrainedModel):
         )  # batch_size x n_docs x tgt_len x dim
         doc_logprobs = torch.nn.functional.log_softmax(doc_scores, dim=1).unsqueeze(-1).unsqueeze(-1)
 
-        # RAG-sequence marginaliation
+        # RAG-sequence marginalization
         first_token_scores = seq_logprobs[:, :, :1, :]
         second_token_scores = seq_logprobs[:, :, 1:2, :]
         remainder = seq_logprobs[:, :, 2:, :]
         rag_logprobs = torch.cat([first_token_scores, second_token_scores + doc_logprobs, remainder], dim=2)
 
-        # calcualate loss
+        # calculate loss
         target = target.unsqueeze(1).unsqueeze(-1).repeat(1, n_docs, 1, 1)
         assert target.dim() == rag_logprobs.dim()
 
@@ -986,7 +986,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
 
 @add_start_docstrings_to_model_forward(
     """
-    A RAG-token model impementation. It performs RAG-token specific marginalization in the forward pass.
+    A RAG-token model implementation. It performs RAG-token specific marginalization in the forward pass.
     """,
     RAG_START_DOCSTRING,
 )
@@ -1129,7 +1129,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
             >>> input_ids = input_dict["input_ids"]
             >>> outputs = model(input_ids=input_ids, labels=input_dict["labels"])
 
-            >>> # or use retriever seperately
+            >>> # or use retriever separately
             >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
             >>> # 1. Encode
             >>> question_hidden_states = model.question_encoder(input_ids)[0]
@@ -1257,7 +1257,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
                 to the forward pass. :obj:`context_input_ids` are returned by
                 :meth:`~transformers.RagRetriever.__call__`.
             doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
-                Score between each retrieved document embeddigs (see :obj:`retrieved_doc_embeds`) and
+                Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
                 :obj:`question_encoder_last_hidden_state`.
 
                 If the model has is not initialized with a ``retriever``, :obj:`context_input_ids` has to be provided
diff --git a/src/transformers/modeling_reformer.py b/src/transformers/modeling_reformer.py
index c31a92087d..3110c591f5 100755
--- a/src/transformers/modeling_reformer.py
+++ b/src/transformers/modeling_reformer.py
@@ -986,7 +986,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
 class ReverseSort(Function):
     """
     After chunked attention is applied which sorted clusters, original ordering has to be restored. Since customized
-    backward function is used for Reformer, the gradients of the output vectors have to be explicitely sorted here.
+    backward function is used for Reformer, the gradients of the output vectors have to be explicitly sorted here.
     """
 
     @staticmethod
@@ -2075,7 +2075,7 @@ class ReformerModel(ReformerPreTrainedModel):
                 device=device,
             )
 
-        # start index for postion encoding depends on incremental decoding
+        # start index for position encoding depends on incremental decoding
         if past_buckets_states is not None:
             start_idx_pos_encodings = past_buckets_states[0][1].shape[1]
         else:
diff --git a/src/transformers/modeling_retribert.py b/src/transformers/modeling_retribert.py
index 69c259b4d3..7801f34a8d 100644
--- a/src/transformers/modeling_retribert.py
+++ b/src/transformers/modeling_retribert.py
@@ -79,7 +79,7 @@ RETRIBERT_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    """Bert Based model to embed queries or document for document retreival. """,
+    """Bert Based model to embed queries or document for document retrieval. """,
     RETRIBERT_START_DOCSTRING,
 )
 class RetriBertModel(RetriBertPreTrainedModel):
@@ -117,7 +117,7 @@ class RetriBertModel(RetriBertPreTrainedModel):
                 attention_mask, input_shape, device
             )
 
-            # define function for cehckpointing
+            # define function for checkpointing
             def partial_encode(*inputs):
                 encoder_outputs = sent_encoder.encoder(
                     inputs[0],
@@ -200,7 +200,7 @@ class RetriBertModel(RetriBertPreTrainedModel):
 
         Return:
             :obj:`torch.FloatTensor`: The bidirectional cross-entropy loss obtained while trying to match each query to
-            its corresponding document and each cocument to its corresponding query in the batch
+            its corresponding document and each document to its corresponding query in the batch
         """
         device = input_ids_query.device
         q_reps = self.embed_questions(input_ids_query, attention_mask_query, checkpoint_batch_size)
diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py
index 38712c43a8..f71acf1492 100644
--- a/src/transformers/modeling_t5.py
+++ b/src/transformers/modeling_t5.py
@@ -44,7 +44,7 @@ _CONFIG_FOR_DOC = "T5Config"
 _TOKENIZER_FOR_DOC = "T5Tokenizer"
 
 ####################################################
-# This dict contrains shortcut names and associated url
+# This dict contains shortcut names and associated url
 # for the pretrained weights provided with the models
 ####################################################
 T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -156,7 +156,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
 class T5LayerNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        Construct a layernorm module in the T5 style No bias and no substraction of mean.
+        Construct a layernorm module in the T5 style No bias and no subtraction of mean.
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -256,7 +256,7 @@ class T5Attention(nn.Module):
             relative_position: an int32 Tensor
             bidirectional: a boolean - whether the attention is bidirectional
             num_buckets: an integer
-            max_distance: an intege
+            max_distance: an integer
 
         Returns:
             a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
@@ -705,7 +705,7 @@ class T5Stack(T5PreTrainedModel):
             raise ValueError(f"You have to specify either {err_msg_prefix}inputs or {err_msg_prefix}inputs_embeds")
 
         if inputs_embeds is None:
-            assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings"
+            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
             inputs_embeds = self.embed_tokens(input_ids)
 
         batch_size, seq_length = input_shape
diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py
index 0159ed7139..bbf3ef72b6 100644
--- a/src/transformers/modeling_tf_albert.py
+++ b/src/transformers/modeling_tf_albert.py
@@ -739,7 +739,7 @@ ALBERT_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Albert Model transformer outputing raw hidden-states without any specific head on top.",
+    "The bare Albert Model transformer outputting raw hidden-states without any specific head on top.",
     ALBERT_START_DOCSTRING,
 )
 class TFAlbertModel(TFAlbertPreTrainedModel):
diff --git a/src/transformers/modeling_tf_auto.py b/src/transformers/modeling_tf_auto.py
index c90028125c..a6620c4da4 100644
--- a/src/transformers/modeling_tf_auto.py
+++ b/src/transformers/modeling_tf_auto.py
@@ -364,14 +364,14 @@ TF_AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
             model_args (additional positional arguments, `optional`):
                 Will be passed along to the underlying model ``__init__()`` method.
             config (:class:`~transformers.PretrainedConfig`, `optional`):
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can
+                Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                 be automatically loaded when:
 
                     - The model is a model provided by the library (loaded with the `shortcut name` string of a
                       pretrained model).
                     - The model was saved using :meth:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
-                      by suppling the save directory.
-                    - The model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a
+                      by supplying the save directory.
+                    - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
                       configuration JSON file named `config.json` is found in the directory.
             state_dict (`Dict[str, torch.Tensor]`, `optional`):
                 A state dictionary to use instead of a state dictionary loaded from saved weights file.
@@ -398,7 +398,7 @@ TF_AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
             output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
             local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not to only look at local files (e.g., not try doanloading the model).
+                Whether or not to only look at local files (e.g., not try downloading the model).
             use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
                 Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
                 our S3 (faster). Should be set to :obj:`False` for checkpoints larger than 20GB.
@@ -815,7 +815,7 @@ class TFAutoModelForMaskedLM:
     This is a generic model class that will be instantiated as one of the model classes of the library---with a masked
     language modeling head---when created with the when created with the
     :meth:`~transformers.TFAutoModelForMaskedLM.from_pretrained` class method or the
-    :meth:`~transformers.TFAutoModelForMasedLM.from_config` class method.
+    :meth:`~transformers.TFAutoModelForMaskedLM.from_config` class method.
 
     This class cannot be instantiated directly using ``__init__()`` (throws an error).
     """
@@ -1297,7 +1297,7 @@ class TFAutoModelForTokenClassification:
 class TFAutoModelForMultipleChoice:
     r"""
     This is a generic model class that will be instantiated as one of the model classes of the library---with a
-    multiple choice classifcation head---when created with the when created with the
+    multiple choice classification head---when created with the
     :meth:`~transformers.TFAutoModelForMultipleChoice.from_pretrained` class method or the
     :meth:`~transformers.TFAutoModelForMultipleChoice.from_config` class method.
 
diff --git a/src/transformers/modeling_tf_bart.py b/src/transformers/modeling_tf_bart.py
index 64ce19f08a..9cef77d9d3 100644
--- a/src/transformers/modeling_tf_bart.py
+++ b/src/transformers/modeling_tf_bart.py
@@ -332,7 +332,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
                 - **x** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)`
 
                 - **encoder_states** (List[Tensor]): all intermediate hidden states of shape `(src_len, batch,
-                  embed_dim)`. Only populated if *return_all_hiddens* is True.
+                  embed_dim)`. Only populated if *output_hidden_states* is True.
                 - **all_attentions** (List[Tensor]): Attention weights for each layer.
                 During training might not be of length n_layers because of layer dropout.
         """
diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py
index f4e9a622b2..56e171d74a 100644
--- a/src/transformers/modeling_tf_bert.py
+++ b/src/transformers/modeling_tf_bert.py
@@ -784,7 +784,7 @@ BERT_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
+    "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
     BERT_START_DOCSTRING,
 )
 class TFBertModel(TFBertPreTrainedModel):
diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py
index 56e23a5ccd..4239c38116 100644
--- a/src/transformers/modeling_tf_distilbert.py
+++ b/src/transformers/modeling_tf_distilbert.py
@@ -346,7 +346,7 @@ class TFTransformer(tf.keras.layers.Layer):
 
         Returns:
             hidden_state: tf.Tensor(bs, seq_length, dim)
-                Sequence of hiddens states in the last (top) layer
+                Sequence of hidden states in the last (top) layer
             all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)]
                 Tuple of length n_layers with the hidden states from each layer.
                 Optional: only if output_hidden_states=True
@@ -552,7 +552,7 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
 
-        iinputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
+        inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
             Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
             This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
             vectors than the model's internal embedding lookup matrix.
@@ -571,7 +571,7 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
+    "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
     DISTILBERT_START_DOCSTRING,
 )
 class TFDistilBertModel(TFDistilBertPreTrainedModel):
diff --git a/src/transformers/modeling_tf_flaubert.py b/src/transformers/modeling_tf_flaubert.py
index be69b2d747..59604249ba 100644
--- a/src/transformers/modeling_tf_flaubert.py
+++ b/src/transformers/modeling_tf_flaubert.py
@@ -109,7 +109,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
             A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
             languages ids which can be obtained from the language names by using two conversion mappings provided in
             the configuration of the model (only provided for multilingual models). More precisely, the `language name
-            to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary strring to int) and the
+            to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary string to int) and the
             `language id to language name` mapping is in :obj:`model.config.id2lang` (dictionary int to string).
 
             See usage examples detailed in the :doc:`multilingual documentation <../multilingual>`.
@@ -128,7 +128,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
             `What are position IDs? <../glossary.html#position-ids>`__
         lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`):
             Length of each sentence that can be used to avoid performing attention on padding token indices. You can
-            also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in
+            also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
             ``[0, ..., input_ids.size(-1)]``:
         cache (:obj:`Dict[str, tf.Tensor]`, `optional`):
             Dictionary string to ``tf.FloatTensor`` that contains precomputed hidden states (key and values in the
@@ -214,7 +214,7 @@ class TFFlaubertPreTrainedModel(TFPreTrainedModel):
 
 
 @add_start_docstrings(
-    "The bare Flaubert Model transformer outputing raw hidden-states without any specific head on top.",
+    "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
     FLAUBERT_START_DOCSTRING,
 )
 class TFFlaubertModel(TFFlaubertPreTrainedModel):
diff --git a/src/transformers/modeling_tf_funnel.py b/src/transformers/modeling_tf_funnel.py
index 4b706ed856..b5ee93c33b 100644
--- a/src/transformers/modeling_tf_funnel.py
+++ b/src/transformers/modeling_tf_funnel.py
@@ -178,7 +178,7 @@ class TFFunnelAttentionStructure:
         self.sin_dropout = tf.keras.layers.Dropout(config.hidden_dropout)
         self.cos_dropout = tf.keras.layers.Dropout(config.hidden_dropout)
         # Track where we are at in terms of pooling from the original input, e.g., by how much the sequence length was
-        # dividide.
+        # divided.
         self.pooling_mult = None
 
     def init_attention_inputs(self, inputs_embeds, attention_mask=None, token_type_ids=None, training=False):
@@ -219,7 +219,7 @@ class TFFunnelAttentionStructure:
         """
         if self.attention_type == "factorized":
             # Notations from the paper, appending A.2.2, final formula.
-            # We need to create and return the matrics phi, psi, pi and omega.
+            # We need to create and return the matrices phi, psi, pi and omega.
             pos_seq = tf.range(0, seq_len, 1.0, dtype=dtype)
             freq_seq = tf.range(0, self.d_model // 2, 1.0, dtype=dtype)
             inv_freq = 1 / (10000 ** (freq_seq / (self.d_model // 2)))
diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py
index b12634b441..5d8658d83e 100644
--- a/src/transformers/modeling_tf_gpt2.py
+++ b/src/transformers/modeling_tf_gpt2.py
@@ -549,7 +549,7 @@ GPT2_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
+    "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
     GPT2_START_DOCSTRING,
 )
 class TFGPT2Model(TFGPT2PreTrainedModel):
diff --git a/src/transformers/modeling_tf_longformer.py b/src/transformers/modeling_tf_longformer.py
index ebdbcc2773..817c407358 100644
--- a/src/transformers/modeling_tf_longformer.py
+++ b/src/transformers/modeling_tf_longformer.py
@@ -172,9 +172,9 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
         Returns: tf.Tensor
         """
         mask = tf.cast(tf.math.not_equal(x, self.padding_idx), dtype=tf.int32)
-        incremental_indicies = tf.math.cumsum(mask, axis=1) * mask
+        incremental_indices = tf.math.cumsum(mask, axis=1) * mask
 
-        return incremental_indicies + self.padding_idx
+        return incremental_indices + self.padding_idx
 
     def create_position_ids_from_inputs_embeds(self, inputs_embeds):
         """
@@ -560,7 +560,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
         # batch_size x num_heads x max_num_global_attention_tokens x sequence_length
         # which is the attention weights from tokens with global attention to all tokens
         # It doesn't not return local attention
-        # In case of variable number of global attantion in the rows of a batch,
+        # In case of variable number of global attention in the rows of a batch,
         # attn_probs are padded with -10000.0 attention scores
         # LOCAL ATTN:
         # without global attention, return local attention probabilities
@@ -618,7 +618,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
         chunked_query = self._chunk(query, window_overlap)
         chunked_key = self._chunk(key, window_overlap)
 
-        # matrix multipication
+        # matrix multiplication
         # bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
         # bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim
         # bcxy: batch_size * num_heads x chunks x 2window_overlap x 2window_overlap
@@ -826,7 +826,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
                                        -0.7584,  0.4206, -0.0405,  0.1599,
                                        2.0514, -1.1600,  0.5372,  0.2629 ]
               window_overlap = num_rows = 4
-             (pad & diagonilize) =>
+             (pad & diagonalize) =>
              [ 0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000
                0.0000,  -1.8348,  0.7672,  0.2986,  0.0285, 0.0000,  0.0000
                0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599, 0.0000
@@ -853,7 +853,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
 
     @staticmethod
     def _chunk(hidden_states, window_overlap):
-        """convert into overlapping chunkings. Chunk size = 2w, overlap size = w"""
+        """convert into overlapping chunks. Chunk size = 2w, overlap size = w"""
         batch_size, seq_length, hidden_dim = shape_list(hidden_states)
         num_output_chunks = 2 * (seq_length // (2 * window_overlap)) - 1
 
@@ -1557,7 +1557,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
 
             `What are attention masks? <../glossary.html#attention-mask>`__
         global_attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
-            Mask to decide the attention given on each token, local attention or global attenion. Tokens with global
+            Mask to decide the attention given on each token, local attention or global attention. Tokens with global
             attention attends to all other tokens, and all other tokens attend to them. This is important for
             task-specific finetuning because it makes the model more flexible at representing the task. For example,
             for classification, the <s> token should be given global attention. For QA, all question tokens should also
diff --git a/src/transformers/modeling_tf_lxmert.py b/src/transformers/modeling_tf_lxmert.py
index bd9d2e0ca4..e7c527eaaf 100644
--- a/src/transformers/modeling_tf_lxmert.py
+++ b/src/transformers/modeling_tf_lxmert.py
@@ -50,7 +50,7 @@ TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 @dataclass
 class TFLxmertModelOutput(ModelOutput):
     """
-    Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilites for the language,
+    Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
     visual, and, cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship"
     encoder")
 
@@ -423,7 +423,7 @@ class TFLxmertSelfAttentionLayer(tf.keras.layers.Layer):
         self.attention_output = TFLxmertAttentionOutput(config, name="output")
 
     def call(self, input_tensor, attention_mask, output_attentions, training=False):
-        # Self attention attends to itself, thus keys and querys are the same (input_tensor).
+        # Self attention attends to itself, thus keys and queries are the same (input_tensor).
         self_output = self.self(input_tensor, input_tensor, attention_mask, output_attentions)
         if output_attentions:
             attention_probs = self_output[1]
@@ -868,7 +868,7 @@ LXMERT_START_DOCSTRING = r"""
     <https://arxiv.org/abs/1908.07490>`__ by Hao Tan and Mohit Bansal. It's a vision and language transformer model,
     pre-trained on a variety of multi-modal datasets comprising of GQA, VQAv2.0, MCSCOCO captions, and Visual genome,
     using a combination of masked language modeling, region of interest feature regression, cross entropy loss for
-    question answering attribute prediction, and object tag predicition.
+    question answering attribute prediction, and object tag prediction.
 
     This model is also a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ subclass. Use
     it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
@@ -962,7 +962,7 @@ LXMERT_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Lxmert Model transformer outputing raw hidden-states without any specific head on top.",
+    "The bare Lxmert Model transformer outputting raw hidden-states without any specific head on top.",
     LXMERT_START_DOCSTRING,
 )
 class TFLxmertModel(TFLxmertPreTrainedModel):
diff --git a/src/transformers/modeling_tf_mobilebert.py b/src/transformers/modeling_tf_mobilebert.py
index 2873b542b4..cd763cecf7 100644
--- a/src/transformers/modeling_tf_mobilebert.py
+++ b/src/transformers/modeling_tf_mobilebert.py
@@ -952,7 +952,7 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare MobileBert Model transformer outputing raw hidden-states without any specific head on top.",
+    "The bare MobileBert Model transformer outputting raw hidden-states without any specific head on top.",
     MOBILEBERT_START_DOCSTRING,
 )
 class TFMobileBertModel(TFMobileBertPreTrainedModel):
diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py
index 218946b013..fb4eba7c54 100644
--- a/src/transformers/modeling_tf_openai.py
+++ b/src/transformers/modeling_tf_openai.py
@@ -487,7 +487,7 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
+    "The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
     OPENAI_GPT_START_DOCSTRING,
 )
 class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py
index e784d2d4e5..ffd50e47c9 100644
--- a/src/transformers/modeling_tf_pytorch_utils.py
+++ b/src/transformers/modeling_tf_pytorch_utils.py
@@ -39,7 +39,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="")
     return tuple with:
 
         - pytorch model weight name
-        - transpose: boolean indicating weither TF2.0 and PyTorch weights matrices are transposed with regards to each
+        - transpose: boolean indicating whether TF2.0 and PyTorch weights matrices are transposed with regards to each
           other
     """
     tf_name = tf_name.replace(":0", "")  # device ids
@@ -270,7 +270,7 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
     logger.info("Loading TensorFlow weights from {}".format(tf_checkpoint_path))
 
     # Instantiate and load the associated TF 2.0 model
-    tf_model_class_name = "TF" + pt_model.__class__.__name__  # Add "TF" at the beggining
+    tf_model_class_name = "TF" + pt_model.__class__.__name__  # Add "TF" at the beginning
     tf_model_class = getattr(transformers, tf_model_class_name)
     tf_model = tf_model_class(pt_model.config)
 
diff --git a/src/transformers/modeling_tf_roberta.py b/src/transformers/modeling_tf_roberta.py
index ce3dafed30..d36f935c33 100644
--- a/src/transformers/modeling_tf_roberta.py
+++ b/src/transformers/modeling_tf_roberta.py
@@ -118,9 +118,9 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
         Returns: tf.Tensor
         """
         mask = tf.cast(tf.math.not_equal(x, self.padding_idx), dtype=tf.int32)
-        incremental_indicies = tf.math.cumsum(mask, axis=1) * mask
+        incremental_indices = tf.math.cumsum(mask, axis=1) * mask
 
-        return incremental_indicies + self.padding_idx
+        return incremental_indices + self.padding_idx
 
     def create_position_ids_from_inputs_embeds(self, inputs_embeds):
         """
@@ -709,7 +709,7 @@ ROBERTA_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.",
+    "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
     ROBERTA_START_DOCSTRING,
 )
 class TFRobertaModel(TFRobertaPreTrainedModel):
diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py
index 3dcb29ada2..827af120f8 100644
--- a/src/transformers/modeling_tf_t5.py
+++ b/src/transformers/modeling_tf_t5.py
@@ -71,7 +71,7 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
 class TFT5LayerNorm(tf.keras.layers.Layer):
     def __init__(self, epsilon=1e-6, **kwargs):
         """
-        Construct a layernorm module in the T5 style No bias and no substraction of mean.
+        Construct a layernorm module in the T5 style No bias and no subtraction of mean.
         """
         super().__init__(**kwargs)
         self.variance_epsilon = epsilon
@@ -170,7 +170,7 @@ class TFT5Attention(tf.keras.layers.Layer):
             relative_position: an int32 Tensor
             bidirectional: a boolean - whether the attention is bidirectional
             num_buckets: an integer
-            max_distance: an intege
+            max_distance: an integer
 
         Returns:
             a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
@@ -682,8 +682,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
 
         if self.is_decoder and encoder_attention_mask is not None:
             # If a 2D ou 3D attention mask is provided for the cross-attention
-            # we need to make broadcastabe to [batch_size, num_heads, mask_seq_length, mask_seq_length]
-            # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+            # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
+            # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
             encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=tf.float32)
             num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
             if num_dims_encoder_attention_mask == 3:
@@ -894,7 +894,7 @@ T5_INPUTS_DOCSTRING = r"""
             sequence of hidden states at the output of the last layer of the encoder. Used in the cross-attention of
             the decoder.
         past_key_values (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
-            ontains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
 
             If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
             (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
diff --git a/src/transformers/modeling_tf_transfo_xl.py b/src/transformers/modeling_tf_transfo_xl.py
index 5a86c7d646..3883a370c7 100644
--- a/src/transformers/modeling_tf_transfo_xl.py
+++ b/src/transformers/modeling_tf_transfo_xl.py
@@ -800,7 +800,7 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
+    "The bare Transformer-XL Model transformer outputting raw hidden-states without any specific head on top.",
     TRANSFO_XL_START_DOCSTRING,
 )
 class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 9968a7fbd0..f33275c616 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -145,7 +145,7 @@ class TFCausalLanguageModelingLoss:
 
 class TFQuestionAnsweringLoss:
     """
-    Loss function suitable for quetion answering.
+    Loss function suitable for question answering.
     """
 
     def compute_loss(self, labels, logits):
@@ -807,7 +807,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
 
     Args:
         vocab_size (:obj:`int`):
-            The size of the vocabular, e.g., the number of unique tokens.
+            The size of the vocabulary, e.g., the number of unique tokens.
         hidden_size (:obj:`int`):
             The size of the embedding vectors.
         initializer_range (:obj:`float`, `optional`):
@@ -860,7 +860,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
             :obj:`tf.Tensor`: In embedding mode, the output is a float32 embedding tensor, with shape
             :obj:`[batch_size, length, embedding_size]`.
 
-            In linear mode, the ouput is a float32 with shape :obj:`[batch_size, length, vocab_size]`.
+            In linear mode, the output is a float32 with shape :obj:`[batch_size, length, vocab_size]`.
 
         Raises:
             ValueError: if :obj:`mode` is not valid.
@@ -1043,7 +1043,7 @@ def get_initializer(initializer_range: float = 0.02) -> tf.initializers.Truncate
 def cast_bool_to_primitive(bool_variable: Union[tf.Tensor, bool], default_tensor_to_true=False) -> bool:
     """
     Function arguments can be inserted as boolean tensor and bool variables to cope with Keras serialization we need to
-    cast the bool argumnets (like :obj:`output_attentions` for instance) to correct boolean if it is a tensor.
+    cast the bool arguments (like :obj:`output_attentions` for instance) to correct boolean if it is a tensor.
 
     Args:
         bool_variable (:obj:`Union[tf.Tensor, bool]`):
diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py
index 901ca14ecc..d3724986c5 100644
--- a/src/transformers/modeling_tf_xlm.py
+++ b/src/transformers/modeling_tf_xlm.py
@@ -654,7 +654,7 @@ XLM_INPUTS_DOCSTRING = r"""
             `What are position IDs? <../glossary.html#position-ids>`__
         lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`):
             Length of each sentence that can be used to avoid performing attention on padding token indices. You can
-            also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in
+            also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
             ``[0, ..., input_ids.size(-1)]``.
         cache (:obj:`Dict[str, tf.Tensor]`, `optional`):
             Dictionary string to ``torch.FloatTensor`` that contains precomputed hidden states (key and values in the
@@ -688,7 +688,7 @@ XLM_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare XLM Model transformer outputing raw hidden-states without any specific head on top.",
+    "The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
     XLM_START_DOCSTRING,
 )
 class TFXLMModel(TFXLMPreTrainedModel):
diff --git a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py
index 70a10de343..3de2135f21 100644
--- a/src/transformers/modeling_tf_xlnet.py
+++ b/src/transformers/modeling_tf_xlnet.py
@@ -652,7 +652,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         # data mask: input mask & perm mask
         assert input_mask is None or attention_mask is None, (
             "You can only use one of input_mask (uses 1 for padding) "
-            "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one."
+            "or attention_mask (uses 0 for padding, added for compatibility with BERT). Please choose one."
         )
         if input_mask is None and attention_mask is not None:
             input_mask = 1.0 - tf.cast(attention_mask, dtype=dtype_float)
@@ -1122,7 +1122,7 @@ XLNET_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.",
+    "The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
     XLNET_START_DOCSTRING,
 )
 class TFXLNetModel(TFXLNetPreTrainedModel):
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 55b2116680..195df86810 100755
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -359,7 +359,7 @@ class ModuleUtilsMixin:
         Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a
         batch with this transformer model. Default approximation neglects the quadratic dependency on the number of
         tokens (valid if :obj:`12 * d_model << sequence_length`) as laid out in `this paper
-        <https://arxiv.org/pdf/2001.08361.pdf>`__ section 2.1. Should be overriden for transformers with parameter
+        <https://arxiv.org/pdf/2001.08361.pdf>`__ section 2.1. Should be overridden for transformers with parameter
         re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths.
 
         Args:
diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py
index e404891181..fda792f570 100755
--- a/src/transformers/modeling_xlm.py
+++ b/src/transformers/modeling_xlm.py
@@ -366,7 +366,7 @@ XLM_INPUTS_DOCSTRING = r"""
             `What are position IDs? <../glossary.html#position-ids>`__
         lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Length of each sentence that can be used to avoid performing attention on padding token indices. You can
-            also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in
+            also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
             ``[0, ..., input_ids.size(-1)]``.
         cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`):
             Dictionary string to ``torch.FloatTensor`` that contains precomputed hidden states (key and values in the
diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py
index 90a90715f9..2264cdf302 100755
--- a/src/transformers/modeling_xlnet.py
+++ b/src/transformers/modeling_xlnet.py
@@ -1132,7 +1132,7 @@ class XLNetModel(XLNetPreTrainedModel):
 
         # data mask: input mask & perm mask
         assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) "
-        "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one."
+        "or attention_mask (uses 0 for padding, added for compatibility with BERT). Please choose one."
         if input_mask is None and attention_mask is not None:
             input_mask = 1.0 - attention_mask
         if input_mask is not None and perm_mask is not None:
diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py
index 1e99051bd7..370c10077e 100644
--- a/src/transformers/optimization_tf.py
+++ b/src/transformers/optimization_tf.py
@@ -97,7 +97,7 @@ def create_optimizer(
     Args:
         init_lr (:obj:`float`):
             The desired learning rate at the end of the warmup phase.
-        num_train_step (:obj:`int`):
+        num_train_steps (:obj:`int`):
             The total number of training steps.
         num_warmup_steps (:obj:`int`):
             The number of warmup steps.
diff --git a/src/transformers/retrieval_rag.py b/src/transformers/retrieval_rag.py
index 820f41f32c..30e8c1c951 100644
--- a/src/transformers/retrieval_rag.py
+++ b/src/transformers/retrieval_rag.py
@@ -465,8 +465,6 @@ class RagRetriever:
         Postprocessing retrieved ``docs`` and combining them with ``input_strings``.
 
         Args:
-            doc_scores (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`):
-                Retrieval scores of respective docs - passed for logging.
             docs  (:obj:`dict`):
                 Retrieved documents.
             input_strings (:obj:`str`):
diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py
index a75fa9e9a8..1e58a54266 100644
--- a/src/transformers/tokenization_bert.py
+++ b/src/transformers/tokenization_bert.py
@@ -293,7 +293,7 @@ class BertTokenizer(PreTrainedTokenizer):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
diff --git a/src/transformers/tokenization_bertweet.py b/src/transformers/tokenization_bertweet.py
index 2143beedb2..0c77a9fc2c 100644
--- a/src/transformers/tokenization_bertweet.py
+++ b/src/transformers/tokenization_bertweet.py
@@ -223,7 +223,7 @@ class BertweetTokenizer(PreTrainedTokenizer):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
diff --git a/src/transformers/tokenization_camembert.py b/src/transformers/tokenization_camembert.py
index d867f62db5..3abc5d78bb 100644
--- a/src/transformers/tokenization_camembert.py
+++ b/src/transformers/tokenization_camembert.py
@@ -184,7 +184,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
diff --git a/src/transformers/tokenization_camembert_fast.py b/src/transformers/tokenization_camembert_fast.py
index f111c76d14..31a5066072 100644
--- a/src/transformers/tokenization_camembert_fast.py
+++ b/src/transformers/tokenization_camembert_fast.py
@@ -191,7 +191,7 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
diff --git a/src/transformers/tokenization_deberta.py b/src/transformers/tokenization_deberta.py
index e59b34ea2f..c12c7a2734 100644
--- a/src/transformers/tokenization_deberta.py
+++ b/src/transformers/tokenization_deberta.py
@@ -623,7 +623,7 @@ class DebertaTokenizer(PreTrainedTokenizer):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(
                 map(
diff --git a/src/transformers/tokenization_fsmt.py b/src/transformers/tokenization_fsmt.py
index 767e379981..0f5420407c 100644
--- a/src/transformers/tokenization_fsmt.py
+++ b/src/transformers/tokenization_fsmt.py
@@ -431,7 +431,7 @@ class FSMTTokenizer(PreTrainedTokenizer):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(
                 map(
diff --git a/src/transformers/tokenization_herbert.py b/src/transformers/tokenization_herbert.py
index 15487bb515..664b93b512 100644
--- a/src/transformers/tokenization_herbert.py
+++ b/src/transformers/tokenization_herbert.py
@@ -40,7 +40,7 @@ class HerbertTokenizer(XLMTokenizer):
 
     Peculiarities:
 
-    - uses BERT's pre-tokenizer: BaseTokenizer splits tokens on spaces, and also on punctuation. Each occurence of a
+    - uses BERT's pre-tokenizer: BaseTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a
       punctuation character will be treated separately.
 
     - Such pretokenized input is BPE subtokenized
diff --git a/src/transformers/tokenization_herbert_fast.py b/src/transformers/tokenization_herbert_fast.py
index aa0b344d42..642f8aa1ba 100644
--- a/src/transformers/tokenization_herbert_fast.py
+++ b/src/transformers/tokenization_herbert_fast.py
@@ -39,8 +39,8 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
 
     Peculiarities:
 
-    - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurence of a
-      punctuation character will be treated separately.
+    - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
+      a punctuation character will be treated separately.
 
     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
     should refer to the superclass for more information regarding methods.
@@ -122,7 +122,7 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
diff --git a/src/transformers/tokenization_mbart.py b/src/transformers/tokenization_mbart.py
index 4f4d880a8d..4392395e3e 100644
--- a/src/transformers/tokenization_mbart.py
+++ b/src/transformers/tokenization_mbart.py
@@ -136,7 +136,7 @@ class MBartTokenizer(XLMRobertaTokenizer):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
         prefix_ones = [1] * len(self.prefix_tokens)
diff --git a/src/transformers/tokenization_mbart_fast.py b/src/transformers/tokenization_mbart_fast.py
index a3aeae357f..0eadfb8546 100644
--- a/src/transformers/tokenization_mbart_fast.py
+++ b/src/transformers/tokenization_mbart_fast.py
@@ -132,7 +132,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
         prefix_ones = [1] * len(self.prefix_tokens)
diff --git a/src/transformers/tokenization_phobert.py b/src/transformers/tokenization_phobert.py
index 1558b9c102..7e36eee660 100644
--- a/src/transformers/tokenization_phobert.py
+++ b/src/transformers/tokenization_phobert.py
@@ -204,7 +204,7 @@ class PhobertTokenizer(PreTrainedTokenizer):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
diff --git a/src/transformers/tokenization_prophetnet.py b/src/transformers/tokenization_prophetnet.py
index db87520cd3..6936c032f9 100644
--- a/src/transformers/tokenization_prophetnet.py
+++ b/src/transformers/tokenization_prophetnet.py
@@ -206,7 +206,7 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
diff --git a/src/transformers/tokenization_transfo_xl.py b/src/transformers/tokenization_transfo_xl.py
index d3f8613d0d..d6b1f3a4f4 100644
--- a/src/transformers/tokenization_transfo_xl.py
+++ b/src/transformers/tokenization_transfo_xl.py
@@ -129,7 +129,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether or not to lowercase the input when tokenizing.
         delimiter (:obj:`str`, `optional`):
-            The delimiter used btween tokens.
+            The delimiter used between tokens.
         vocab_file (:obj:`str`, `optional`):
             File containing the vocabulary (from the original implementation).
         pretrained_vocab_file (:obj:`str`, `optional`):
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index 6c2722fc38..612616a515 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -53,7 +53,7 @@ TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
 
 def _is_whitespace(char):
     """Checks whether `char` is a whitespace character."""
-    # \t, \n, and \r are technically contorl characters but we treat them
+    # \t, \n, and \r are technically control characters but we treat them
     # as whitespace since they are generally considered as such.
     if char == " " or char == "\t" or char == "\n" or char == "\r":
         return True
@@ -367,7 +367,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         vocabulary.
 
         Args:
-            token (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
+            tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
 
         Returns:
             :obj:`int` or :obj:`List[int]`: The token id or list of token ids.
@@ -644,7 +644,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         :obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used.
 
         Args:
-            test (:obj:`str`):
+            text (:obj:`str`):
                 The text to prepare.
             is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether or not the text has been pretokenized.
@@ -669,7 +669,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
             token_ids_1 (:obj:`List[int]`, `optional`):
                 List of ids of the second sequence.
             already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formated with special tokens for the model.
+                Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
             A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
@@ -732,7 +732,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
 
         # To avoid mixing byte-level and unicode for byte-level BPT
-        # we need to build string separatly for added tokens and byte-level tokens
+        # we need to build string separately for added tokens and byte-level tokens
         # cf. https://github.com/huggingface/transformers/issues/1133
         sub_texts = []
         current_sub_text = []
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 0622e78c8b..6a0156d608 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 """
 Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (host all the user
-fronting encoding methodes) Special token mixing (host the special tokens logic) and BatchEncoding (wrap the dictionary
+fronting encoding methods) Special token mixin (host the special tokens logic) and BatchEncoding (wrap the dictionary
 of output with special method for the Fast tokenizers)
 """
 
@@ -537,10 +537,10 @@ class BatchEncoding(UserDict):
         Args:
             batch_or_char_index (:obj:`int`):
                 Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
-                the character in the orginal string.
+                the character in the original string.
             char_index (:obj:`int`, `optional`):
                 If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the
-                orginal string.
+                original string.
 
 
         Returns:
@@ -607,7 +607,7 @@ class BatchEncoding(UserDict):
 
                 tensor = as_tensor(value)
 
-                # Removing this for now in favor of controling the shape with `prepend_batch_axis`
+                # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
                 # # at-least2d
                 # if tensor.ndim > 2:
                 #     tensor = tensor.squeeze(0)
@@ -648,7 +648,7 @@ class SpecialTokensMixin:
     """
     A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` to
     handle specific behaviors related to special tokens. In particular, this class hold the attributes which can be
-    used to directly access these special tokens in a model-independant manner and allow to set and update the special
+    used to directly access these special tokens in a model-independent manner and allow to set and update the special
     tokens.
 
     Args:
@@ -696,8 +696,8 @@ class SpecialTokensMixin:
         self.verbose = verbose
 
         # We directly set the hidden value to allow initialization with special tokens
-        # which are not yet in the vocabulary. Necesssary for serialization/de-serialization
-        # TODO clean this up at some point (probably by sitching to fast tokenizers)
+        # which are not yet in the vocabulary. Necessary for serialization/de-serialization
+        # TODO clean this up at some point (probably by switching to fast tokenizers)
         for key, value in kwargs.items():
             if value is None:
                 continue
@@ -721,7 +721,7 @@ class SpecialTokensMixin:
         Add the missing ones to the vocabulary if needed.
 
         Return:
-            :obj:`int`: The number of tokens added in the vocaulary during the operation.
+            :obj:`int`: The number of tokens added in the vocabulary during the operation.
         """
         return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
 
@@ -805,7 +805,7 @@ class SpecialTokensMixin:
                 string token to let you personalize its behavior: whether this token should only match against a single
                 word, whether this token should strip all potential whitespaces on the left side, whether this token
                 should strip all potential whitespaces on the right side, etc.
-            special_token (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Can be used to specify if the token is a special token. This mostly change the normalization behavior
                 (special tokens like CLS or [MASK] are usually not lower-cased for instance).
 
@@ -1799,7 +1799,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
            modifying :obj:`tokenizer.do_lower_case` after creation).
 
         Args:
-            save_directory (:obj:`str`): The path to adirectory where the tokenizer will be saved.
+            save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
             legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`):
                 Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and a
                 separate added_tokens files or in the unified JSON file format for the `tokenizers` library. It's only
@@ -2006,15 +2006,15 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         # If you only set max_length, it activates truncation for max_length
         if max_length is not None and padding is False and truncation is False:
             if verbose:
-                if not self.deprecation_warnings.get("Truncation-not-explicitely-activated", False):
+                if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
                     logger.warning(
-                        "Truncation was not explicitely activated but `max_length` is provided a specific value, "
-                        "please use `truncation=True` to explicitely truncate examples to max length. "
+                        "Truncation was not explicitly activated but `max_length` is provided a specific value, "
+                        "please use `truncation=True` to explicitly truncate examples to max length. "
                         "Defaulting to 'longest_first' truncation strategy. "
                         "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
                         "more precisely by providing a specific strategy to `truncation`."
                     )
-                self.deprecation_warnings["Truncation-not-explicitely-activated"] = True
+                self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
             truncation = "longest_first"
 
         # Get padding strategy
@@ -2591,7 +2591,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         Create the token type IDs corresponding to the sequences passed. `What are token type IDs?
         <../glossary.html#token-type-ids>`__
 
-        Should be overriden in a subclass if the model has a special way of building those.
+        Should be overridden in a subclass if the model has a special way of building those.
 
         Args:
             token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
@@ -2611,7 +2611,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens.
 
-        This implementation does not add special tokens and this method should be overriden in a subclass.
+        This implementation does not add special tokens and this method should be overridden in a subclass.
 
         Args:
             token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
@@ -2783,7 +2783,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                 and ``convert_tokens_to_ids`` methods.
             num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
                 Number of tokens to remove using the truncation strategy.
-            truncation (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+            truncation_strategy (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
                 The strategy to follow for truncation. Can be:
 
                 * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
@@ -2798,12 +2798,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                   truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                 * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
                   greater than the model maximum admissible input size).
-            max_length (:obj:`int`, `optional`):
-                Controls the maximum length to use by one of the truncation/padding parameters.
-
-                If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
-                length is required by one of the truncation/padding parameters. If the model has no specific maximum
-                input length (like XLNet) truncation/padding to a maximum length will be deactivated.
             stride (:obj:`int`, `optional`, defaults to 0):
                 If set to a positive number, the overflowing tokens returned will contain some tokens from the main
                 sequence returned. The value of this argument defines the number of additional tokens.
@@ -2871,7 +2865,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
-        Pad encoded inputs (on left/right and up to predefined legnth or max length in the batch)
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
 
         Args:
             encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
@@ -3037,7 +3031,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
             token_ids_1 (:obj:`List[int]`, `optional`):
                 List of ids of the second sequence.
             already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formated with special tokens for the model.
+                Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
             A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
@@ -3058,7 +3052,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
     @staticmethod
     def clean_up_tokenization(out_string: str) -> str:
         """
-        Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.
+        Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
 
         Args:
             out_string (:obj:`str`): The text to clean up.
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 8754c3334d..8552aae9d2 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -53,7 +53,7 @@ TOKENIZER_FILE = "tokenizer.json"
 SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
 TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
 
-# Slow tokenizers have an additional addedd tokens files
+# Slow tokenizers have an additional added tokens file
 ADDED_TOKENS_FILE = "added_tokens.json"
 
 
@@ -211,7 +211,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         vocabulary.
 
         Args:
-            token (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
+            tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
 
         Returns:
             :obj:`int` or :obj:`List[int]`: The token id or list of token ids.
@@ -473,7 +473,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         )
 
         # Return tensor is None, then we can remove the leading batch axis
-        # Overfolwing tokens are returned as a batch of output so we keep them in this case
+        # Overflowing tokens are returned as a batch of output so we keep them in this case
         if return_tensors is None and not return_overflowing_tokens:
             batched_output = BatchEncoding(
                 {
diff --git a/src/transformers/tokenization_xlm.py b/src/transformers/tokenization_xlm.py
index a08d1d9fad..df6eb74113 100644
--- a/src/transformers/tokenization_xlm.py
+++ b/src/transformers/tokenization_xlm.py
@@ -909,7 +909,7 @@ class XLMTokenizer(PreTrainedTokenizer):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(
                 map(
diff --git a/src/transformers/tokenization_xlm_prophetnet.py b/src/transformers/tokenization_xlm_prophetnet.py
index e92291a87e..994461ea78 100644
--- a/src/transformers/tokenization_xlm_prophetnet.py
+++ b/src/transformers/tokenization_xlm_prophetnet.py
@@ -202,7 +202,7 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
diff --git a/src/transformers/tokenization_xlm_roberta.py b/src/transformers/tokenization_xlm_roberta.py
index 265645f3ce..031b6ab072 100644
--- a/src/transformers/tokenization_xlm_roberta.py
+++ b/src/transformers/tokenization_xlm_roberta.py
@@ -205,7 +205,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
diff --git a/src/transformers/tokenization_xlm_roberta_fast.py b/src/transformers/tokenization_xlm_roberta_fast.py
index 8cfc53484b..27878706d1 100644
--- a/src/transformers/tokenization_xlm_roberta_fast.py
+++ b/src/transformers/tokenization_xlm_roberta_fast.py
@@ -194,7 +194,7 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
diff --git a/src/transformers/tokenization_xlnet.py b/src/transformers/tokenization_xlnet.py
index d7b83adc7a..fb3b9465a4 100644
--- a/src/transformers/tokenization_xlnet.py
+++ b/src/transformers/tokenization_xlnet.py
@@ -270,7 +270,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
diff --git a/src/transformers/tokenization_xlnet_fast.py b/src/transformers/tokenization_xlnet_fast.py
index 025252e031..671b39da96 100644
--- a/src/transformers/tokenization_xlnet_fast.py
+++ b/src/transformers/tokenization_xlnet_fast.py
@@ -209,7 +209,7 @@ class XLNetTokenizerFast(PreTrainedTokenizerFast):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 69b346d063..7c554a6b79 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1524,8 +1524,6 @@ class Trainer:
         method in the model or subclass and override this method.
 
         Args:
-            model (:obj:`nn.Module`):
-                The model to evaluate.
             inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                 The inputs and targets of the model.
 
diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py
index 14fd40f3e2..f4cf668e42 100644
--- a/src/transformers/trainer_pt_utils.py
+++ b/src/transformers/trainer_pt_utils.py
@@ -135,7 +135,7 @@ def torch_distributed_zero_first(local_rank: int):
 
 class SequentialDistributedSampler(Sampler):
     """
-    Distributed Sampler that subsamples indicies sequentially, making it easier to collate all results at the end.
+    Distributed Sampler that subsamples indices sequentially, making it easier to collate all results at the end.
 
     Even though we only use this sampler for eval and predict (no training), which means that the model params won't
     have to be synced (i.e. will not hang for synchronization even if varied number of forward passes), we still add
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index fe88d2aa4b..2fc6ffd710 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -402,7 +402,7 @@ class TrainingArguments:
             n_gpu = torch.cuda.device_count()
         else:
             # Here, we'll use torch.distributed.
-            # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
             torch.distributed.init_process_group(backend="nccl")
             device = torch.device("cuda", self.local_rank)
             n_gpu = 1
diff --git a/src/transformers/utils/notebook.py b/src/transformers/utils/notebook.py
index 92bd1a62da..fd986e2639 100644
--- a/src/transformers/utils/notebook.py
+++ b/src/transformers/utils/notebook.py
@@ -205,7 +205,7 @@ class NotebookTrainingTracker(NotebookProgressBar):
 
         num_steps (:obj:`int`): The number of steps during training.
         column_names (:obj:`List[str]`, `optional`):
-            The list of column names for the metrics table (will be infered from the first call to
+            The list of column names for the metrics table (will be inferred from the first call to
             :meth:`~transformers.utils.notebook.NotebookTrainingTracker.write_line` if not set).
     """
 
@@ -246,7 +246,7 @@ class NotebookTrainingTracker(NotebookProgressBar):
 
     def add_child(self, total, prefix=None, width=300):
         """
-        Add a child progress bar disaplyed under the table of metrics. The child progress bar is returned (so it can be
+        Add a child progress bar displayed under the table of metrics. The child progress bar is returned (so it can be
         easily updated).
 
         Args:
diff --git a/templates/adding_a_new_example_script/run_xxx.py b/templates/adding_a_new_example_script/run_xxx.py
index 2cab2a5d76..77b1b1bafd 100644
--- a/templates/adding_a_new_example_script/run_xxx.py
+++ b/templates/adding_a_new_example_script/run_xxx.py
@@ -45,7 +45,7 @@ from utils_squad import (
     write_predictions_extended,
 )
 
-# The follwing import is the official SQuAD evaluation script (2.0).
+# The following import is the official SQuAD evaluation script (2.0).
 # You can remove it from the dependencies if you are using this script outside of the library
 # We've added it here for automated tests (see examples/test_examples.py file)
 from utils_squad_evaluate import EVAL_OPTS
diff --git a/templates/adding_a_new_example_script/utils_xxx.py b/templates/adding_a_new_example_script/utils_xxx.py
index 48967b3366..653be61c6b 100644
--- a/templates/adding_a_new_example_script/utils_xxx.py
+++ b/templates/adding_a_new_example_script/utils_xxx.py
@@ -426,8 +426,8 @@ def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_ans
     #
     # However, this is not always possible. Consider the following:
     #
-    #   Question: What country is the top exporter of electornics?
-    #   Context: The Japanese electronics industry is the lagest in the world.
+    #   Question: What country is the top exporter of electronics?
+    #   Context: The Japanese electronics industry is the largest in the world.
     #   Answer: Japan
     #
     # In this case, the annotator chose "Japan" as a character sub-span of
diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py
index 34dc225195..e11c638700 100644
--- a/templates/adding_a_new_model/configuration_xxx.py
+++ b/templates/adding_a_new_model/configuration_xxx.py
@@ -57,7 +57,7 @@ class XxxConfig(PretrainedConfig):
 
             If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py
index e4d2a0ac17..e512dbd7ad 100644
--- a/templates/adding_a_new_model/modeling_tf_xxx.py
+++ b/templates/adding_a_new_model/modeling_tf_xxx.py
@@ -352,7 +352,7 @@ XXX_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare XXX Model transformer outputing raw hidden-states without any specific head on top.",
+    "The bare XXX Model transformer outputting raw hidden-states without any specific head on top.",
     XXX_START_DOCSTRING,
 )
 class TFXxxModel(TFXxxPreTrainedModel):
diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py
index 94b9eba7d5..2df0960305 100644
--- a/templates/adding_a_new_model/tokenization_xxx.py
+++ b/templates/adding_a_new_model/tokenization_xxx.py
@@ -237,7 +237,7 @@ class XxxTokenizer(PreTrainedTokenizer):
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
diff --git a/tests/test_logging.py b/tests/test_logging.py
index e94bf53b42..843d59b981 100644
--- a/tests/test_logging.py
+++ b/tests/test_logging.py
@@ -34,7 +34,7 @@ class HfArgumentParserTest(unittest.TestCase):
         logger = logging.get_logger("transformers.tokenization_bart")
         msg = "Testing 1, 2, 3"
 
-        # should be able to log warnings (if default settings weren't overriden by `pytest --log-level-all`)
+        # should be able to log warnings (if default settings weren't overridden by `pytest --log-level-all`)
         if level_origin <= logging.WARNING:
             with CaptureLogger(logger) as cl:
                 logger.warn(msg)
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 552651faa3..1043736f1f 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -1130,7 +1130,7 @@ class UtilsFunctionsTest(unittest.TestCase):
                     2.12662941,
                     -9.32562038,
                     2.35652522,
-                ],  # cummulative prob of 5 highest values <= 0.6
+                ],  # cumulative prob of 5 highest values <= 0.6
                 [
                     0.58425518,
                     4.53139238,
@@ -1162,7 +1162,7 @@ class UtilsFunctionsTest(unittest.TestCase):
                     9.67702323,  # 1st highest value; idx. 27
                     -5.89478553,
                     1.85370467,
-                ],  # cummulative prob of 5 highest values <= 0.6
+                ],  # cumulative prob of 5 highest values <= 0.6
             ],
             dtype=torch.float,
             device=torch_device,
diff --git a/tests/test_modeling_tf_lxmert.py b/tests/test_modeling_tf_lxmert.py
index 89c67c9290..3e78a5e764 100644
--- a/tests/test_modeling_tf_lxmert.py
+++ b/tests/test_modeling_tf_lxmert.py
@@ -497,7 +497,7 @@ class TFLxmertModelTest(TFModelTesterMixin, unittest.TestCase):
                 return_obj_labels="PreTraining" in model_class.__name__
             )
 
-            pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beggining
+            pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
             pt_model_class = getattr(transformers, pt_model_class_name)
 
             config.output_hidden_states = True
diff --git a/utils/check_copies.py b/utils/check_copies.py
index d8c0f23f85..ab84706797 100644
--- a/utils/check_copies.py
+++ b/utils/check_copies.py
@@ -100,7 +100,7 @@ def is_copy_consistent(filename, overwrite=False):
         lines = f.readlines()
     diffs = []
     line_index = 0
-    # Not a foor loop cause `lines` is going to change (if `overwrite=True`).
+    # Not a for loop because `lines` is going to change (if `overwrite=True`).
     while line_index < len(lines):
         search = _re_copy_warning.search(lines[line_index])
         if search is None:
@@ -164,9 +164,9 @@ def check_copies(overwrite: bool = False):
     if not overwrite and len(diffs) > 0:
         diff = "\n".join(diffs)
         raise Exception(
-            "Found the follwing copy inconsistencies:\n"
+            "Found the following copy inconsistencies:\n"
             + diff
-            + "\nRun `make fix-copies` or `python utils/check_copies --fix_and_overwrite` to fix them."
+            + "\nRun `make fix-copies` or `python utils/check_copies.py --fix_and_overwrite` to fix them."
         )
     check_model_list_copy(overwrite=overwrite)