diff --git a/docs/source/main_classes/callback.rst b/docs/source/main_classes/callback.rst
index 1b31c54645..f4160185bf 100644
--- a/docs/source/main_classes/callback.rst
+++ b/docs/source/main_classes/callback.rst
@@ -12,7 +12,7 @@ subclass :class:`~transformers.Trainer` and override the methods you need (see :
 
 By default a :class:`~transformers.Trainer` will use the following callbacks:
 
-- :class:`~transformers.DefaultFlowCallback` which handles the default beahvior for logging, saving and evaluation.
+- :class:`~transformers.DefaultFlowCallback` which handles the default behavior for logging, saving and evaluation.
 - :class:`~transformers.PrinterCallback` or :class:`~transformers.ProrgressCallback` to display progress and print the
   logs (the first one is used if you deactivate tqdm through the :class:`~transformers.TrainingArguments`, otherwise
   it's the second one).
diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst
index 07050d1707..54e7c51595 100644
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -15,7 +15,7 @@ Both :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` contain
 previous features. To inject custom behavior you can subclass them and override the following methods:
 
 - **get_train_dataloader**/**get_train_tfdataset** -- Creates the training DataLoader (PyTorch) or TF Dataset.
-- **get_eval_dataloader**/**get_eval_tfdataset** -- Creates the evaulation DataLoader (PyTorch) or TF Dataset.
+- **get_eval_dataloader**/**get_eval_tfdataset** -- Creates the evaluation DataLoader (PyTorch) or TF Dataset.
 - **get_test_dataloader**/**get_test_tfdataset** -- Creates the test DataLoader (PyTorch) or TF Dataset.
 - **log** -- Logs information on the various objects watching training.
 - **create_optimizer_and_scheduler** -- Setups the optimizer and learning rate scheduler if they were not passed at
diff --git a/docs/source/philosophy.rst b/docs/source/philosophy.rst
index b449aeb083..e111df0f2e 100644
--- a/docs/source/philosophy.rst
+++ b/docs/source/philosophy.rst
@@ -66,7 +66,7 @@ The library is built around three types of classes for each model:
 All these classes can be instantiated from pretrained instances and saved locally using two methods:
 
 - :obj:`from_pretrained()` lets you instantiate a model/configuration/tokenizer from a pretrained version either
-  provided by the library itself (the suported models are provided in the list :doc:`here <pretrained_models>`
+  provided by the library itself (the supported models are provided in the list :doc:`here <pretrained_models>`)
   or stored locally (or on a server) by the user,
 - :obj:`save_pretrained()` lets you save a model/configuration/tokenizer locally so that it can be reloaded using
   :obj:`from_pretrained()`.
diff --git a/examples/seq2seq/bertabs/README.md b/examples/seq2seq/bertabs/README.md
index 7835e8bc84..d5e6bbbaa2 100644
--- a/examples/seq2seq/bertabs/README.md
+++ b/examples/seq2seq/bertabs/README.md
@@ -39,7 +39,7 @@ python run_summarization.py \
     --compute_rouge true
 ```
 
-The scripts executes on GPU if one is available and if `no_cuda` is not set to `true`. Inference on multiple GPUs is not suported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
+The script executes on GPU if one is available and if `no_cuda` is not set to `true`. Inference on multiple GPUs is not supported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
 
 ## Summarize any text
 
diff --git a/src/transformers/configuration_mmbt.py b/src/transformers/configuration_mmbt.py
index d650ada16b..cae65ab5c5 100644
--- a/src/transformers/configuration_mmbt.py
+++ b/src/transformers/configuration_mmbt.py
@@ -31,7 +31,7 @@ class MMBTConfig(object):
             Config of the underlying Transformer models. Its values are copied over to use a single config.
         num_labels (:obj:`int`, `optional`):
             Size of final Linear layer for classification.
-        modal_hidden_size (:obj:`int`, `optional`, defautls to 2048):
+        modal_hidden_size (:obj:`int`, `optional`, defaults to 2048):
             Embedding dimension of the non-text modality encoder.
     """
 
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 68744c0f1e..0509a0e1ad 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -274,7 +274,7 @@ class PretrainedConfig(object):
                 Path to a directory in which a downloaded pretrained model configuration should be cached if the
                 standard cache should not be used.
             force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Wheter or not to force to (re-)download the configuration files and override the cached versions if they
+                Whether or not to force to (re-)download the configuration files and override the cached versions if they
                 exist.
             resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether or not to delete incompletely received file. Attempts to resume the download if such a file
diff --git a/src/transformers/convert_graph_to_onnx.py b/src/transformers/convert_graph_to_onnx.py
index dbdc00249c..ca6a6fec48 100644
--- a/src/transformers/convert_graph_to_onnx.py
+++ b/src/transformers/convert_graph_to_onnx.py
@@ -211,7 +211,7 @@ def load_graph_from_args(pipeline_name: str, framework: str, model: str, tokeniz
         pipeline_name: The kind of pipeline to use (ner, question-answering, etc.)
         framework: The actual model to convert the pipeline from ("pt" or "tf")
         model: The model name which will be loaded by the pipeline
-        tokenizer: The tokenizer name which will be loaded by the pipeline, defaut to the model's value
+        tokenizer: The tokenizer name which will be loaded by the pipeline, defaults to the model's value
 
     Returns: Pipeline object
 
diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py
index f3b54b3470..8fd2352473 100644
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -560,7 +560,7 @@ class SquadProcessor(DataProcessor):
 
         Args:
             dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
-            evaluate: boolean specifying if in evaluation mode or in training mode
+            evaluate: Boolean specifying if in evaluation mode or in training mode
 
         Returns:
             List of SquadExample
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index d79072b361..d92ee0bf66 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -1093,7 +1093,7 @@ def is_tensor(x):
 class ModelOutput(OrderedDict):
     """
     Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
-    a tuple) or strings (like a dictionnary) that will ignore the ``None`` attributes. Otherwise behaves like a
+    a tuple) or strings (like a dictionary) that will ignore the ``None`` attributes. Otherwise behaves like a
     regular python dictionary.
 
     .. warning::
diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py
index 27f815bd18..036899ed81 100644
--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -197,7 +197,7 @@ class TensorBoardCallback(TrainerCallback):
 
     Args:
         tb_writer (:obj:`SummaryWriter`, `optional`):
-            The writer to use. Will instatiate one if not set.
+            The writer to use. Will instantiate one if not set.
     """
 
     def __init__(self, tb_writer=None):
diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py
index 5dd0201bbf..9a2a4584a6 100644
--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -507,7 +507,7 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
                 :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
                 request.
             output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether ot not to also return a dictionnary containing missing keys, unexpected keys and error
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error
                 messages.
             local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether or not to only look at local files (e.g., not try doanloading the model).
diff --git a/src/transformers/modeling_tf_auto.py b/src/transformers/modeling_tf_auto.py
index 3a547bf491..f8f073b089 100644
--- a/src/transformers/modeling_tf_auto.py
+++ b/src/transformers/modeling_tf_auto.py
@@ -390,7 +390,7 @@ TF_AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
                 :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
                 request.
             output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether ot not to also return a dictionnary containing missing keys, unexpected keys and error
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error
                 messages.
             local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether or not to only look at local files (e.g., not try doanloading the model).
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 11f361590a..1a2e83d028 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -569,7 +569,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
                 :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
                 request.
             output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether ot not to also return a dictionnary containing missing keys, unexpected keys and error
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error
                 messages.
             local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether or not to only look at local files (e.g., not try doanloading the model).
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 009fdc86cb..4a24fedd3f 100755
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -802,7 +802,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
                 :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
                 request.
             output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether ot not to also return a dictionnary containing missing keys, unexpected keys and error
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error
                 messages.
             local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether or not to only look at local files (e.g., not try doanloading the model).
diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py
index 8fe86432ed..32f6ba1aee 100644
--- a/src/transformers/optimization_tf.py
+++ b/src/transformers/optimization_tf.py
@@ -169,7 +169,7 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
         epsilon (:obj:`float`, `optional`, defaults to 1e-7):
             The epsilon paramenter in Adam, which is a small constant for numerical stability.
         amsgrad (:obj:`bool`, `optional`, default to `False`):
-            Wheter to apply AMSGrad varient of this algorithm or not, see
+            Whether to apply AMSGrad variant of this algorithm or not, see
             `On the Convergence of Adam and Beyond <https://arxiv.org/abs/1904.09237>`__.
         weight_decay_rate (:obj:`float`, `optional`, defaults to 0):
             The weight decay to apply.
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index cace7a16d1..9edc6380cd 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1766,7 +1766,7 @@ class QuestionAnsweringPipeline(Pipeline):
 
     def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
         """
-        Take the output of any :obj:`ModelForQuestionAnswering` and will generate probalities for each span to be
+        Take the output of any :obj:`ModelForQuestionAnswering` and generate probabilities for each span to be
         the actual answer.
 
         In addition, it filters out some unwanted/impossible cases like answer len being greater than
@@ -1807,7 +1807,7 @@ class QuestionAnsweringPipeline(Pipeline):
 
     def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]:
         """
-        When decoding from token probalities, this method maps token indexes to actual word in
+        When decoding from token probabilities, this method maps token indexes to actual word in
         the initial context.
 
         Args:
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index b4d8829f4d..937e0ebf88 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -682,7 +682,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
             token_ids_1 (:obj:`List[int]`, `optional`):
                 List of ids of the second sequence.
             already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Wheter or not the token list is already formated with special tokens for the model.
+                Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
             A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
@@ -815,7 +815,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
             you want to reload it using the :meth:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
 
         Args:
-            save_directory (:obj:`str`): The path to adirectory where the tokenizer will be saved.
+            save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
 
         Returns:
             A tuple of :obj:`str`: The files saved.
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index d459befbcf..9bd50a9488 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -15,7 +15,7 @@
 """ Base classes common to both the slow and the fast tokenization classes:
     PreTrainedTokenizerBase (host all the user fronting encoding methodes)
     Special token mixing (host the special tokens logic) and
-    BatchEncoding (wrap the dictionnary of output with special method for the Fast tokenizers)
+    BatchEncoding (wrap the dictionary of output with special method for the Fast tokenizers)
 """
 
 import copy
@@ -249,7 +249,7 @@ class BatchEncoding(UserDict):
 
     def tokens(self, batch_index: int = 0) -> List[str]:
         """
-        Return the list of tokens (sub-parts of the input strings after word/subword splitting and before converstion
+        Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion
         to integer indices) at a given batch index (only works for the output of a fast tokenizer).
 
         Args:
@@ -1121,7 +1121,7 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
             return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether or not to return overflowing token sequences.
             return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Wheter or not to return special tokens mask information.
+                Whether or not to return special tokens mask information.
             return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether or not to return :obj:`(char_start, char_end)` for each token.
 
@@ -1153,13 +1153,13 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
             - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
               :obj:`return_overflowing_tokens=True`).
             - **special_tokens_mask** -- List of 0s and 1s, with 0 specifying added special tokens and 1 specifying
-              regual sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`).
+              regular sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`).
             - **length** -- The length of the inputs (when :obj:`return_length=True`)
 """
 
 INIT_TOKENIZER_DOCSTRING = r"""
     Class attributes (overridden by derived classes)
-        - **vocab_files_names** (:obj:`Dict[str, str]`) -- A ditionary with, as keys, the ``__init__`` keyword name of
+        - **vocab_files_names** (:obj:`Dict[str, str]`) -- A dictionary with, as keys, the ``__init__`` keyword name of
           each vocabulary file required by the model, and as associated values, the filename for saving the associated
           file (string).
         - **pretrained_vocab_files_map** (:obj:`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
@@ -1170,7 +1170,7 @@ INIT_TOKENIZER_DOCSTRING = r"""
           :obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence
           inputs of this model, or :obj:`None` if the model has no maximum input size.
         - **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
-          :obj:`short-cut-names` of the pretrained models, and as associated values, a dictionnary of specific
+          :obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific
           arguments to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the
           tokenizer with the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`
           method.
@@ -1688,7 +1688,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
            modifying :obj:`tokenizer.do_lower_case` after creation).
 
         Args:
-            save_directory (:obj:`str`): The path to adirectory where the tokenizer will be saved.
+            save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
 
         Returns:
             A tuple of :obj:`str`: The files saved.
@@ -2383,7 +2383,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         batch_size = len(encoded_inputs["input_ids"])
         assert all(
             len(v) == batch_size for v in encoded_inputs.values()
-        ), "Some items in the output dictionnary have a different batch size than others."
+        ), "Some items in the output dictionary have a different batch size than others."
 
         if padding_strategy == PaddingStrategy.LONGEST:
             max_length = max(len(inputs) for inputs in encoded_inputs["input_ids"])
@@ -2547,7 +2547,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
             sequence = ids + pair_ids if pair else ids
             token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
 
-        # Build output dictionnary
+        # Build output dictionary
         encoded_inputs["input_ids"] = sequence
         if return_token_type_ids:
             encoded_inputs["token_type_ids"] = token_type_ids
@@ -2819,7 +2819,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
             token_ids_1 (:obj:`List[int]`, `optional`):
                 List of ids of the second sequence.
             already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Wheter or not the token list is already formated with special tokens for the model.
+                Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
             A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 8c00c7ddb3..4ee41c4f97 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -552,7 +552,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             you want to reload it using the :meth:`~transformers.PreTrainedTokenizerFast.from_pretrained` class method.
 
         Args:
-            save_directory (:obj:`str`): The path to adirectory where the tokenizer will be saved.
+            save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
 
         Returns:
             A tuple of :obj:`str`: The files saved.
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 4a7d11d325..3656ee1bc3 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -895,7 +895,7 @@ class Trainer:
                 - the documentation of `tune.run <https://docs.ray.io/en/latest/tune/api_docs/execution.html#tune-run>`__
 
         Returns:
-            :class:`transformers.trainer_utils.BestRun`: All the informations about the best run.
+            :class:`transformers.trainer_utils.BestRun`: All the information about the best run.
         """
         if backend is None:
             backend = default_hp_search_backend()
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 9359a9f17c..04a9b08434 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -94,7 +94,7 @@ class TrainingArguments:
         logging_dir (:obj:`str`, `optional`):
             Tensorboard log directory. Will default to `runs/**CURRENT_DATETIME_HOSTNAME**`.
         logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Wheter to log and evalulate the first :obj:`global_step` or not.
+            Whether to log and evaluate the first :obj:`global_step` or not.
         logging_steps (:obj:`int`, `optional`, defaults to 500):
             Number of update steps between two logs.
         save_steps (:obj:`int`, `optional`, defaults to 500):
@@ -114,7 +114,7 @@ class TrainingArguments:
         local_rank (:obj:`int`, `optional`, defaults to -1):
             During distributed training, the rank of the process.
         tpu_num_cores (:obj:`int`, `optional`):
-            When training on TPU, the mumber of TPU cores (automatically passed by launcher script).
+            When training on TPU, the number of TPU cores (automatically passed by launcher script).
         debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
             When training on TPU, whether to print debug metrics or not.
         dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
@@ -159,7 +159,7 @@ class TrainingArguments:
             Will default to :obj:`"loss"` if unspecified and :obj:`load_best_model_at_end=True` (to use the evaluation
             loss).
 
-            If you set this value, :obj:`greater_is_better` will defaut to :obj:`True`. Don't forget to set it to
+            If you set this value, :obj:`greater_is_better` will default to :obj:`True`. Don't forget to set it to
             :obj:`False` if your metric is better when lower.
         greater_is_better (:obj:`bool`, `optional`)
             Use in conjunction with :obj:`load_best_model_at_end` and :obj:`metric_for_best_model` to specify if better
diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py
index b5c06aa4e0..2a99071666 100644
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -66,7 +66,7 @@ class TFTrainingArguments(TrainingArguments):
         logging_dir (:obj:`str`, `optional`):
             Tensorboard log directory. Will default to `runs/**CURRENT_DATETIME_HOSTNAME**`.
         logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Wheter to log and evalulate the first :obj:`global_step` or not.
+            Whether to log and evaluate the first :obj:`global_step` or not.
         logging_steps (:obj:`int`, `optional`, defaults to 500):
             Number of update steps between two logs.
         save_steps (:obj:`int`, `optional`, defaults to 500):
@@ -86,7 +86,7 @@ class TFTrainingArguments(TrainingArguments):
         local_rank (:obj:`int`, `optional`, defaults to -1):
             During distributed training, the rank of the process.
         tpu_num_cores (:obj:`int`, `optional`):
-            When training on TPU, the mumber of TPU cores (automatically passed by launcher script).
+            When training on TPU, the number of TPU cores (automatically passed by launcher script).
         debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether to activate the trace to record computation graphs and profiling information or not.
         dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):