From 3222fc645bbb97eac6fd2fea05e4c0eb20d297a7 Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Mon, 7 Nov 2022 15:19:04 +0100
Subject: [PATCH] docs: Resolve many typos in the English docs (#20088)

* docs: Fix typo in ONNX parser help: 'tolerence' => 'tolerance'

* docs: Resolve many typos in the English docs

Typos found via 'codespell ./docs/source/en'
---
 docs/source/en/add_tensorflow_model.mdx               |  4 ++--
 docs/source/en/big_models.mdx                         |  4 ++--
 docs/source/en/main_classes/deepspeed.mdx             |  2 +-
 docs/source/en/main_classes/processors.mdx            |  2 +-
 docs/source/en/main_classes/trainer.mdx               |  4 ++--
 docs/source/en/model_doc/bloom.mdx                    |  2 +-
 docs/source/en/model_doc/gpt2.mdx                     |  2 +-
 docs/source/en/model_doc/gptj.mdx                     |  2 +-
 docs/source/en/model_doc/longformer.mdx               |  2 +-
 docs/source/en/model_doc/mobilevit.mdx                |  2 +-
 docs/source/en/model_doc/mt5.mdx                      |  2 +-
 docs/source/en/model_doc/splinter.mdx                 |  2 +-
 docs/source/en/model_doc/t5v1.1.mdx                   |  2 +-
 docs/source/en/model_doc/vision-text-dual-encoder.mdx |  2 +-
 docs/source/en/perf_train_gpu_one.mdx                 |  2 +-
 docs/source/en/serialization.mdx                      |  4 ++--
 docs/source/en/tasks/semantic_segmentation.mdx        |  2 +-
 docs/source/en/testing.mdx                            | 10 +++++-----
 docs/source/it/serialization.mdx                      |  2 +-
 docs/source/pt/serialization.mdx                      |  2 +-
 src/transformers/onnx/__main__.py                     |  2 +-
 21 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/docs/source/en/add_tensorflow_model.mdx b/docs/source/en/add_tensorflow_model.mdx
index 756af3d44e..e145a7d001 100644
--- a/docs/source/en/add_tensorflow_model.mdx
+++ b/docs/source/en/add_tensorflow_model.mdx
@@ -179,7 +179,7 @@ Now it's time to finally start coding. Our suggested starting point is the PyTor
 `modeling_brand_new_bert.py` inside `src/transformers/models/brand_new_bert/` into
 `modeling_tf_brand_new_bert.py`. The goal of this section is to modify the file and update the import structure of
 🤗 Transformers such that you can import `TFBrandNewBert` and
-`TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` sucessfully loads a working TensorFlow *BrandNewBert* model.
+`TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` successfully loads a working TensorFlow *BrandNewBert* model.
 
 Sadly, there is no prescription to convert a PyTorch model into TensorFlow. You can, however, follow our selection of
 tips to make the process as smooth as possible:
@@ -217,7 +217,7 @@ documentation pages. You can complete this part entirely following the patterns
 ([example](https://github.com/huggingface/transformers/pull/18020/files)). Here's a list of the needed manual
 changes:
 - Include all public classes of *BrandNewBert* in `src/transformers/__init__.py`
-- Add *BrandNewBert* classes to the corresponing Auto classes in `src/transformers/models/auto/modeling_tf_auto.py`
+- Add *BrandNewBert* classes to the corresponding Auto classes in `src/transformers/models/auto/modeling_tf_auto.py`
 - Include the modeling file in the documentation test file list in `utils/documentation_tests.txt`
 - Add the lazy loading classes related to *BrandNewBert* in `src/transformers/utils/dummy_tf_objects.py`
 - Update the import structures for the public classes in `src/transformers/models/brand_new_bert/__init__.py`
diff --git a/docs/source/en/big_models.mdx b/docs/source/en/big_models.mdx
index 7f062e703f..971403b62d 100644
--- a/docs/source/en/big_models.mdx
+++ b/docs/source/en/big_models.mdx
@@ -72,7 +72,7 @@ On top of the configuration of the model, we see three different weights files,
 
 The main advantage of doing this for big models is that during step 2 of the workflow shown above, each shard of the checkpoint is loaded after the previous one, capping the memory usage in RAM to the model size plus the size of the biggest shard.
 
-Beind the scenes, the index file is used to determine which keys are in the checkpoint, and where the corresponding weights are stored. We can load that index like any json and get a dictionary:
+Behind the scenes, the index file is used to determine which keys are in the checkpoint, and where the corresponding weights are stored. We can load that index like any json and get a dictionary:
 
 ```py
 >>> import json
@@ -86,7 +86,7 @@ Beind the scenes, the index file is used to determine which keys are in the chec
 dict_keys(['metadata', 'weight_map'])
 ```
 
-The metadata just consists of the total size of the model for now. We plan to add several other informations in the future:
+The metadata just consists of the total size of the model for now. We plan to add other information in the future:
 
 ```py
 >>> index["metadata"]
diff --git a/docs/source/en/main_classes/deepspeed.mdx b/docs/source/en/main_classes/deepspeed.mdx
index 8fb819fee5..7926ddb5c6 100644
--- a/docs/source/en/main_classes/deepspeed.mdx
+++ b/docs/source/en/main_classes/deepspeed.mdx
@@ -1499,7 +1499,7 @@ fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
 
 <Tip>
 
-Note, that once `load_state_dict_from_zero_checkpoint` was run, the `model` will no longer be useable in the
+Note, that once `load_state_dict_from_zero_checkpoint` was run, the `model` will no longer be usable in the
 DeepSpeed context of the same application. i.e. you will need to re-initialize the deepspeed engine, since
 `model.load_state_dict(state_dict)` will remove all the DeepSpeed magic from it. So do this only at the very end
 of the training.
diff --git a/docs/source/en/main_classes/processors.mdx b/docs/source/en/main_classes/processors.mdx
index 5d0d3f8307..d37f8a1a40 100644
--- a/docs/source/en/main_classes/processors.mdx
+++ b/docs/source/en/main_classes/processors.mdx
@@ -112,7 +112,7 @@ Additionally, the following method can be used to convert SQuAD examples into
 [[autodoc]] data.processors.squad.squad_convert_examples_to_features
 
 
-These processors as well as the aforementionned method can be used with files containing the data as well as with the
+These processors as well as the aforementioned method can be used with files containing the data as well as with the
 *tensorflow_datasets* package. Examples are given below.
 
 
diff --git a/docs/source/en/main_classes/trainer.mdx b/docs/source/en/main_classes/trainer.mdx
index ab942a2c1a..a0b914cd40 100644
--- a/docs/source/en/main_classes/trainer.mdx
+++ b/docs/source/en/main_classes/trainer.mdx
@@ -579,7 +579,7 @@ add `--fsdp "full_shard offload auto_wrap"` or `--fsdp "shard_grad_op offload au
   This specifies the transformer layer class name (case-sensitive) to wrap ,e.g, `BertLayer`, `GPTJBlock`, `T5Block` ....
   This is important because submodules that share weights (e.g., embedding layer) should not end up in different FSDP wrapped units. 
   Using this policy, wrapping happens for each block containing Multi-Head Attention followed by couple of MLP layers. 
-  Remaining layers including the shared embeddings are conviniently wrapped in same outermost FSDP unit.
+  Remaining layers including the shared embeddings are conveniently wrapped in same outermost FSDP unit.
   Therefore, use this for transformer based models.
   - For size based auto wrap policy, please add `--fsdp_min_num_params <number>` to command line arguments.
   It specifies FSDP's minimum number of parameters for auto wrapping.
@@ -620,7 +620,7 @@ please follow this nice medium article [GPU-Acceleration Comes to PyTorch on M1
 
 **Usage**:
 User has to just pass `--use_mps_device` argument. 
-For example, you can run the offical Glue text classififcation task (from the root folder) using Apple Silicon GPU with below command:
+For example, you can run the official Glue text classififcation task (from the root folder) using Apple Silicon GPU with below command:
 
 ```bash
 export TASK_NAME=mrpc
diff --git a/docs/source/en/model_doc/bloom.mdx b/docs/source/en/model_doc/bloom.mdx
index 1672da43e5..a3a2aa81d7 100644
--- a/docs/source/en/model_doc/bloom.mdx
+++ b/docs/source/en/model_doc/bloom.mdx
@@ -32,7 +32,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 
 <PipelineTag pipeline="text-generation"/>
 
-- [`BloomForCausalLM`] is suppported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
+- [`BloomForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
 
 ⚡️ Inference
 - A blog on [Optimization story: Bloom inference](https://huggingface.co/blog/bloom-inference-optimization).
diff --git a/docs/source/en/model_doc/gpt2.mdx b/docs/source/en/model_doc/gpt2.mdx
index 064af2ab58..475b080136 100644
--- a/docs/source/en/model_doc/gpt2.mdx
+++ b/docs/source/en/model_doc/gpt2.mdx
@@ -61,7 +61,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 - A notebook on how to [finetune GPT2 to generate lyrics in the style of your favorite artist](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb). 🌎
 - A notebook on how to [finetune GPT2 to generate tweets in the style of your favorite Twitter user](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb). 🌎
 - [Causal language modeling](https://huggingface.co/course/en/chapter7/6?fw=pt#training-a-causal-language-model-from-scratch) chapter of the 🤗 Hugging Face Course.
-- [`GPT2LMHeadModel`] is suppported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling), [text generation example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation), and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
+- [`GPT2LMHeadModel`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling), [text generation example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation), and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
 - [`TFGPT2LMHeadModel`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_clmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).
 - [`FlaxGPT2LMHeadModel`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#causal-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/causal_language_modeling_flax.ipynb).
 
diff --git a/docs/source/en/model_doc/gptj.mdx b/docs/source/en/model_doc/gptj.mdx
index 62bd224746..d549e0ad6d 100644
--- a/docs/source/en/model_doc/gptj.mdx
+++ b/docs/source/en/model_doc/gptj.mdx
@@ -47,7 +47,7 @@ Tips:
   that could be found [here](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/howto_finetune.md)
 
 - Although the embedding matrix has a size of 50400, only 50257 entries are used by the GPT-2 tokenizer. These extra
-  tokens are added for the sake of efficiency on TPUs. To avoid the mis-match between embedding matrix size and vocab
+  tokens are added for the sake of efficiency on TPUs. To avoid the mismatch between embedding matrix size and vocab
   size, the tokenizer for [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) contains 143 extra tokens
   `<|extratoken_1|>... <|extratoken_143|>`, so the `vocab_size` of tokenizer also becomes 50400.
 
diff --git a/docs/source/en/model_doc/longformer.mdx b/docs/source/en/model_doc/longformer.mdx
index 2ebc63b2be..3b639281cf 100644
--- a/docs/source/en/model_doc/longformer.mdx
+++ b/docs/source/en/model_doc/longformer.mdx
@@ -40,7 +40,7 @@ This model was contributed by [beltagy](https://huggingface.co/beltagy). The Aut
 
 Longformer self attention employs self attention on both a "local" context and a "global" context. Most tokens only
 attend "locally" to each other meaning that each token attends to its \\(\frac{1}{2} w\\) previous tokens and
-\\(\frac{1}{2} w\\) succeding tokens with \\(w\\) being the window length as defined in
+\\(\frac{1}{2} w\\) succeeding tokens with \\(w\\) being the window length as defined in
 `config.attention_window`. Note that `config.attention_window` can be of type `List` to define a
 different \\(w\\) for each layer. A selected few tokens attend "globally" to all other tokens, as it is
 conventionally done for all tokens in `BertSelfAttention`.
diff --git a/docs/source/en/model_doc/mobilevit.mdx b/docs/source/en/model_doc/mobilevit.mdx
index e0799d2962..818f8432af 100644
--- a/docs/source/en/model_doc/mobilevit.mdx
+++ b/docs/source/en/model_doc/mobilevit.mdx
@@ -26,7 +26,7 @@ Tips:
 - One can use [`MobileViTFeatureExtractor`] to prepare images for the model. Note that if you do your own preprocessing, the pretrained checkpoints expect images to be in BGR pixel order (not RGB).
 - The available image classification checkpoints are pre-trained on [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k) (also referred to as ILSVRC 2012, a collection of 1.3 million images and 1,000 classes).
 - The segmentation model uses a [DeepLabV3](https://arxiv.org/abs/1706.05587) head. The available semantic segmentation checkpoints are pre-trained on [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/). 
-- As the name suggests MobileViT was desgined to be performant and efficient on mobile phones. The TensorFlow versions of the MobileViT models are fully compatible with [TensorFlow Lite](https://www.tensorflow.org/lite). 
+- As the name suggests MobileViT was designed to be performant and efficient on mobile phones. The TensorFlow versions of the MobileViT models are fully compatible with [TensorFlow Lite](https://www.tensorflow.org/lite). 
 
   You can use the following code to convert a MobileViT checkpoint (be it image classification or semantic segmentation) to generate a 
   TensorFlow Lite model:
diff --git a/docs/source/en/model_doc/mt5.mdx b/docs/source/en/model_doc/mt5.mdx
index a00003da46..dc08ed55a1 100644
--- a/docs/source/en/model_doc/mt5.mdx
+++ b/docs/source/en/model_doc/mt5.mdx
@@ -28,7 +28,7 @@ generative model chooses to (partially) translate its prediction into the wrong
 checkpoints used in this work are publicly available.*
 
 Note: mT5 was only pre-trained on [mC4](https://huggingface.co/datasets/mc4) excluding any supervised training.
-Therefore, this model has to be fine-tuned before it is useable on a downstream task, unlike the original T5 model.
+Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 model.
 Since mT5 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
 fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
 
diff --git a/docs/source/en/model_doc/splinter.mdx b/docs/source/en/model_doc/splinter.mdx
index 9623ec7501..55e5f61b8d 100644
--- a/docs/source/en/model_doc/splinter.mdx
+++ b/docs/source/en/model_doc/splinter.mdx
@@ -41,7 +41,7 @@ Tips:
 - If you plan on using Splinter outside *run_qa.py*, please keep in mind the question token - it might be important for
   the success of your model, especially in a few-shot setting.
 - Please note there are two different checkpoints for each size of Splinter. Both are basically the same, except that
-  one also has the pretrained wights of the QASS layer (*tau/splinter-base-qass* and *tau/splinter-large-qass*) and one
+  one also has the pretrained weights of the QASS layer (*tau/splinter-base-qass* and *tau/splinter-large-qass*) and one
   doesn't (*tau/splinter-base* and *tau/splinter-large*). This is done to support randomly initializing this layer at
   fine-tuning, as it is shown to yield better results for some cases in the paper.
 
diff --git a/docs/source/en/model_doc/t5v1.1.mdx b/docs/source/en/model_doc/t5v1.1.mdx
index b15188961d..a5b64f77dc 100644
--- a/docs/source/en/model_doc/t5v1.1.mdx
+++ b/docs/source/en/model_doc/t5v1.1.mdx
@@ -39,7 +39,7 @@ T5 Version 1.1 includes the following improvements compared to the original T5 m
   `num_heads` and `d_ff`.
 
 Note: T5 Version 1.1 was only pre-trained on [C4](https://huggingface.co/datasets/c4) excluding any supervised
-training. Therefore, this model has to be fine-tuned before it is useable on a downstream task, unlike the original T5
+training. Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5
 model. Since t5v1.1 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
 fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
 
diff --git a/docs/source/en/model_doc/vision-text-dual-encoder.mdx b/docs/source/en/model_doc/vision-text-dual-encoder.mdx
index fcc4f38288..c7ee59d77a 100644
--- a/docs/source/en/model_doc/vision-text-dual-encoder.mdx
+++ b/docs/source/en/model_doc/vision-text-dual-encoder.mdx
@@ -21,7 +21,7 @@ downstream task. This model can be used to align the vision-text embeddings usin
 training and then can be used for zero-shot vision tasks such image-classification or retrieval.
 
 In [LiT: Zero-Shot Transfer with Locked-image Text Tuning](https://arxiv.org/abs/2111.07991) it is shown how
-leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant improvment on
+leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant improvement on
 new zero-shot vision tasks such as image classification or retrieval.
 
 ## VisionTextDualEncoderConfig
diff --git a/docs/source/en/perf_train_gpu_one.mdx b/docs/source/en/perf_train_gpu_one.mdx
index 075db67a58..9636bf9c2f 100644
--- a/docs/source/en/perf_train_gpu_one.mdx
+++ b/docs/source/en/perf_train_gpu_one.mdx
@@ -609,7 +609,7 @@ for step, batch in enumerate(dataloader, start=1):
         optimizer.zero_grad()
 ```
 
-First we wrap the dataset in a [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). Then we can enable gradient checkpointing by calling the model's [`~PreTrainedModel.gradient_checkpointing_enable`] method. When we initialize the [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator) we can specifiy if we want to use mixed precision training and it will take care of it for us in the [`prepare`] call. During the [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare) call the dataloader will also be distributed across workers should we use multiple GPUs. We use the same 8-bit optimizer from the earlier experiments.
+First we wrap the dataset in a [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). Then we can enable gradient checkpointing by calling the model's [`~PreTrainedModel.gradient_checkpointing_enable`] method. When we initialize the [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator) we can specify if we want to use mixed precision training and it will take care of it for us in the [`prepare`] call. During the [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare) call the dataloader will also be distributed across workers should we use multiple GPUs. We use the same 8-bit optimizer from the earlier experiments.
 
 Finally, we can write the main training loop. Note that the `backward` call is handled by 🤗 Accelerate. We can also see how gradient accumulation works: we normalize the loss so we get the average at the end of accumulation and once we have enough steps we run the optimization. Now the question is: does this use the same amount of memory as the previous steps? Let's check:
 
diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx
index 1cbc1237f2..f18c434a6f 100644
--- a/docs/source/en/serialization.mdx
+++ b/docs/source/en/serialization.mdx
@@ -137,7 +137,7 @@ optional arguments:
   --feature {causal-lm, ...}
                         The type of features to export the model with.
   --opset OPSET         ONNX opset version to export the model with.
-  --atol ATOL           Absolute difference tolerence when validating the model.
+  --atol ATOL           Absolute difference tolerance when validating the model.
 ```
 
 Exporting a checkpoint using a ready-made configuration can be done as follows:
@@ -415,7 +415,7 @@ OrderedDict([('logits', {0: 'batch'})])
 <Tip>
 
 All of the base properties and methods associated with [`~onnx.config.OnnxConfig`] and
-the other configuration classes can be overriden if needed. Check out [`BartOnnxConfig`]
+the other configuration classes can be overridden if needed. Check out [`BartOnnxConfig`]
 for an advanced example.
 
 </Tip>
diff --git a/docs/source/en/tasks/semantic_segmentation.mdx b/docs/source/en/tasks/semantic_segmentation.mdx
index 3d1b5ef453..941f0c7f8a 100644
--- a/docs/source/en/tasks/semantic_segmentation.mdx
+++ b/docs/source/en/tasks/semantic_segmentation.mdx
@@ -282,5 +282,5 @@ To visualize the results, load the [dataset color palette](https://github.com/te
 ```
 
 <div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/semantic-seg-preds.png" alt="Image of bedroom overlayed with segmentation map"/>
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/semantic-seg-preds.png" alt="Image of bedroom overlaid with segmentation map"/>
 </div>
\ No newline at end of file
diff --git a/docs/source/en/testing.mdx b/docs/source/en/testing.mdx
index d702612834..2cf42205a6 100644
--- a/docs/source/en/testing.mdx
+++ b/docs/source/en/testing.mdx
@@ -203,17 +203,17 @@ Example:
     ```"""
 
 ```
-3 steps are required to debug the docstring examples : 
-1. In order to properly run the test, **an extra line has to be added** at the end of the docstring. This can be automatically done on any file using : 
+3 steps are required to debug the docstring examples: 
+1. In order to properly run the test, **an extra line has to be added** at the end of the docstring. This can be automatically done on any file using: 
 ```bash 
 python utils/prepare_for_doc_test.py <path_to_file_or_dir>
 ```
 
-2. Then, you can use the following line to automatically test every docstring example in the desired file : 
+2. Then, you can use the following line to automatically test every docstring example in the desired file: 
 ```bash 
 pytest --doctest-modules <path_to_file_or_dir>
 ```
-3. Once you are done debugging, you need to remove the extra line added in step **1.** by running the follwing : 
+3. Once you are done debugging, you need to remove the extra line added in step **1.** by running the following: 
 ```bash 
 python utils/prepare_for_doc_test.py <path_to_file_or_dir> --remove_new_line
 ```
@@ -1161,7 +1161,7 @@ This helper method creates a copy of the `os.environ` object, so the original re
 
 ### Getting reproducible results
 
-In some situations you may want to remove randomness for your tests. To get identical reproducable results set, you
+In some situations you may want to remove randomness for your tests. To get identical reproducible results set, you
 will need to fix the seed:
 
 ```python
diff --git a/docs/source/it/serialization.mdx b/docs/source/it/serialization.mdx
index 9190e22e44..1dde00f429 100644
--- a/docs/source/it/serialization.mdx
+++ b/docs/source/it/serialization.mdx
@@ -112,7 +112,7 @@ optional arguments:
   --feature {causal-lm, ...}
                         The type of features to export the model with.
   --opset OPSET         ONNX opset version to export the model with.
-  --atol ATOL           Absolute difference tolerence when validating the model.
+  --atol ATOL           Absolute difference tolerance when validating the model.
 ```
 
 L'esportazione di un checkpoint utilizzando una configurazione già pronta può essere eseguita come segue:
diff --git a/docs/source/pt/serialization.mdx b/docs/source/pt/serialization.mdx
index 95abd6c638..2a01640be4 100644
--- a/docs/source/pt/serialization.mdx
+++ b/docs/source/pt/serialization.mdx
@@ -135,7 +135,7 @@ optional arguments:
   --feature {causal-lm, ...}
                         The type of features to export the model with.
   --opset OPSET         ONNX opset version to export the model with.
-  --atol ATOL           Absolute difference tolerence when validating the model.
+  --atol ATOL           Absolute difference tolerance when validating the model.
 ```
 
 A exportação de um checkpoint usando uma configuração pronta pode ser feita da seguinte forma:
diff --git a/src/transformers/onnx/__main__.py b/src/transformers/onnx/__main__.py
index b84e12edbb..20b0333d45 100644
--- a/src/transformers/onnx/__main__.py
+++ b/src/transformers/onnx/__main__.py
@@ -38,7 +38,7 @@ def main():
     )
     parser.add_argument("--opset", type=int, default=None, help="ONNX opset version to export the model with.")
     parser.add_argument(
-        "--atol", type=float, default=None, help="Absolute difference tolerence when validating the model."
+        "--atol", type=float, default=None, help="Absolute difference tolerance when validating the model."
     )
     parser.add_argument(
         "--framework",