chore: Fix typos in docs and examples (#36524)

Fix typos in docs and examples Signed-off-by: co63oc <co63oc@users.noreply.github.com>
2025-03-04 21:47:41 +08:00
parent 84f0186e89
commit 37508816d6
38 changed files with 50 additions and 50 deletions
--- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
+++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
@@ -265,7 +265,7 @@ class FlaxDataCollatorSpeechSeq2SeqWithPadding:
    Data collator that will dynamically pad the inputs received.
    Args:
        processor ([`Wav2Vec2Processor`])
-            The processor used for proccessing the data.
+            The processor used for processing the data.
        decoder_start_token_id (:obj: `int`)
            The begin-of-sentence of the decoder.
        input_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
--- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
+++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
@@ -296,7 +296,7 @@ class DataCollatorForWav2Vec2Pretraining:
            The Wav2Vec2 model used for pretraining. The data collator needs to have access
            to config and ``_get_feat_extract_output_lengths`` function for correct padding.
        feature_extractor (:class:`~transformers.Wav2Vec2FeatureExtractor`):
-            The processor used for proccessing the data.
+            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
@@ -445,7 +445,7 @@ def main():
    accelerator.wait_for_everyone()

    # 1. Download and create train, validation dataset
-    # We load all dataset configuration and datset split pairs passed in
+    # We load all dataset configuration and dataset split pairs passed in
    # ``args.dataset_config_names`` and ``args.dataset_split_names``
    datasets_splits = []
    for dataset_config_name, train_split_name in zip(args.dataset_config_names, args.dataset_split_names):
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -292,7 +292,7 @@ class DataCollatorCTCWithPadding:
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.AutoProcessor`)
-            The processor used for proccessing the data.
+            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
@@ -275,7 +275,7 @@ class DataCollatorCTCWithPadding:
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.AutoProcessor`)
-            The processor used for proccessing the data.
+            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
@@ -559,7 +559,7 @@ def main():
                )

                # if we doing adapter language training, save
-                # vocab with adpter language
+                # vocab with adapter language
                if data_args.target_language is not None:
                    vocab_dict[data_args.target_language] = lang_dict

--- a/examples/pytorch/text-classification/run_classification.py
+++ b/examples/pytorch/text-classification/run_classification.py
@@ -429,7 +429,7 @@ def main():
    if is_regression:
        label_list = None
        num_labels = 1
-        # regession requires float as label type, let's cast it if needed
+        # regression requires float as label type, let's cast it if needed
        for split in raw_datasets.keys():
            if raw_datasets[split].features["label"].dtype not in ["float32", "float64"]:
                logger.warning(
--- a/examples/pytorch/text-generation/README.md
+++ b/examples/pytorch/text-generation/README.md
@@ -19,7 +19,7 @@ limitations under the License.
 Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-generation/run_generation.py).

 Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, GPT-J, Transformer-XL, XLNet, CTRL, BLOOM, LLAMA, OPT.
-A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you
+A similar script is used for our official demo [Write With Transformer](https://transformer.huggingface.co), where you
 can try out the different models available in the library.

 Example usage:
--- a/examples/pytorch/token-classification/README.md
+++ b/examples/pytorch/token-classification/README.md
@@ -19,7 +19,7 @@ limitations under the License.
 ## PyTorch version

 Fine-tuning the library models for token classification task such as Named Entity Recognition (NER), Parts-of-speech
-tagging (POS) or phrase extraction (CHUNKS). The main scrip `run_ner.py` leverages the 🤗 Datasets library and the Trainer API. You can easily
+tagging (POS) or phrase extraction (CHUNKS). The main script `run_ner.py` leverages the 🤗 Datasets library and the Trainer API. You can easily
 customize it to your needs if you need extra processing on your datasets.

 It will either run on a datasets hosted on our [hub](https://huggingface.co/datasets) or with your own text files for
--- a/examples/research_projects/bertabs/configuration_bertabs.py
+++ b/examples/research_projects/bertabs/configuration_bertabs.py
@@ -37,7 +37,7 @@ class BertAbsConfig(PretrainedConfig):
        max_pos: int
            The maximum sequence length that this model will be used with.
        enc_layer: int
-            The numner of hidden layers in the Transformer encoder.
+            The number of hidden layers in the Transformer encoder.
        enc_hidden_size: int
            The size of the encoder's layers.
        enc_heads: int
@@ -49,7 +49,7 @@ class BertAbsConfig(PretrainedConfig):
            embeddings, layers, pooler and also the attention probabilities in
            the encoder.
        dec_layer: int
-            The numner of hidden layers in the decoder.
+            The number of hidden layers in the decoder.
        dec_hidden_size: int
            The size of the decoder's layers.
        dec_heads: int
--- a/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py
+++ b/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py
@@ -130,7 +130,7 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
    mask_tgt = decoder_attention_mask = None
    mask_cls = None

-    # The original model does not apply the geneator layer immediatly but rather in
+    # The original model does not apply the generator layer immediately but rather in
    # the beam search (where it combines softmax + linear layer). Since we already
    # apply the softmax in our generation process we only apply the linear layer here.
    # We make sure that the outputs of the full stack are identical
@@ -143,9 +143,9 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
    output_converted_generator = new_model.generator(output_converted_model)

    maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item()
-    print("Maximum absolute difference beween weights: {:.2f}".format(maximum_absolute_difference))
+    print("Maximum absolute difference between weights: {:.2f}".format(maximum_absolute_difference))
    maximum_absolute_difference = torch.max(torch.abs(output_converted_generator - output_original_generator)).item()
-    print("Maximum absolute difference beween weights: {:.2f}".format(maximum_absolute_difference))
+    print("Maximum absolute difference between weights: {:.2f}".format(maximum_absolute_difference))

    are_identical = torch.allclose(output_converted_model, output_original_model, atol=1e-3)
    if are_identical:
--- a/examples/research_projects/bertabs/modeling_bertabs.py
+++ b/examples/research_projects/bertabs/modeling_bertabs.py
@@ -390,7 +390,7 @@ class MultiHeadedAttention(nn.Module):
    :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`.

    Similar to standard `dot` attention but uses
-    multiple attention distributions simulataneously
+    multiple attention distributions simultaneously
    to select relevant items.

    .. mermaid::
--- a/examples/research_projects/bertabs/run_summarization.py
+++ b/examples/research_projects/bertabs/run_summarization.py
@@ -260,7 +260,7 @@ def main():
        default=None,
        type=str,
        required=False,
-        help="The folder in wich the summaries should be written. Defaults to the folder where the documents are",
+        help="The folder in which the summaries should be written. Defaults to the folder where the documents are",
    )
    parser.add_argument(
        "--compute_rouge",
@@ -315,7 +315,7 @@ def main():
    )
    args = parser.parse_args()

-    # Select device (distibuted not available)
+    # Select device (distributed not available)
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    # Check the existence of directories
--- a/examples/research_projects/codeparrot/scripts/codeparrot_training.py
+++ b/examples/research_projects/codeparrot/scripts/codeparrot_training.py
@@ -24,7 +24,7 @@ class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from stream of text files.
        Args:
-            tokenizer (Tokenizer): The processor used for proccessing the data.
+            tokenizer (Tokenizer): The processor used for processing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
--- a/examples/research_projects/codeparrot/scripts/preprocessing.py
+++ b/examples/research_projects/codeparrot/scripts/preprocessing.py
@@ -84,7 +84,7 @@ def is_config_or_test(example, scan_width=5, coeff=0.05):


 def has_no_keywords(example):
-    """Check if a python file has none of the keywords for: funcion, class, for loop, while loop."""
+    """Check if a python file has none of the keywords for: function, class, for loop, while loop."""
    keywords = ["def ", "class ", "for ", "while "]
    lines = example["content"].splitlines()
    for line in lines:
--- a/examples/research_projects/performer/modeling_flax_performer_utils.py
+++ b/examples/research_projects/performer/modeling_flax_performer_utils.py
@@ -252,7 +252,7 @@ def make_fast_generalized_attention(
    unidirectional=False,
    lax_scan_unroll=1,
 ):
-    """Construct a fast generalized attention menthod."""
+    """Construct a fast generalized attention method."""
    logging.info("Fast generalized attention.: %s features and renormalize=%s", nb_features, renormalize_attention)
    if features_type == "ortho":
        matrix_creator = functools.partial(GaussianOrthogonalRandomMatrix, nb_features, qkv_dim, scaling=False)
--- a/examples/research_projects/rag-end2end-retriever/README.md
+++ b/examples/research_projects/rag-end2end-retriever/README.md
@@ -11,7 +11,7 @@ Please read the [accompanying blog post](https://shamanesiri.medium.com/how-to-f
 The original RAG code has also been modified to work with the latest versions of pytorch lightning (version 1.2.10) and RAY (version 1.3.0). All other implementation details remain the same as the [original RAG code](https://github.com/huggingface/transformers/tree/main/examples/research_projects/rag).
 Read more about RAG  at https://arxiv.org/abs/2005.11401.

-This code can be modified to experiment with other research on retrival augmented models which include training of the retriever (e.g. [REALM](https://arxiv.org/abs/2002.08909) and [MARGE](https://arxiv.org/abs/2006.15020)).
+This code can be modified to experiment with other research on retrieval augmented models which include training of the retriever (e.g. [REALM](https://arxiv.org/abs/2002.08909) and [MARGE](https://arxiv.org/abs/2006.15020)).

 To start training, use the bash script (finetune_rag_ray_end2end.sh) in this folder. This script also includes descriptions on each command-line argument used.

--- a/examples/research_projects/rag-end2end-retriever/lightning_base.py
+++ b/examples/research_projects/rag-end2end-retriever/lightning_base.py
@@ -134,7 +134,7 @@ class BaseTransformer(pl.LightningModule):
            {
                "params": [
                    p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)
-                ],  # check this named paramters
+                ],  # check these named parameters
                "weight_decay": self.hparams.weight_decay,
            },
            {
@@ -279,7 +279,7 @@ class InitCallback(pl.Callback):


 class CheckParamCallback(pl.Callback):
-    # check whether new added model paramters are differentiable
+    # check whether newly added model parameters are differentiable
    def on_after_backward(self, trainer, pl_module):
        # print(pl_module.model.rag)
        for name, param in pl_module.model.rag.named_parameters():
--- a/examples/research_projects/rag/README.md
+++ b/examples/research_projects/rag/README.md
@@ -98,7 +98,7 @@ Our evaluation script enables two modes of evaluation (controlled by the `eval_m

 The evaluation script expects paths to two files:
 - `evaluation_set` - a path to a file specifying the evaluation dataset, a single input per line.
- `gold_data_path` - a path to a file contaning ground truth answers for datapoints from the `evaluation_set`, a single output per line. Check below for expected formats of the gold data files.
+- `gold_data_path` - a path to a file containing ground truth answers for datapoints from the `evaluation_set`, a single output per line. Check below for expected formats of the gold data files.


 ## Retrieval evaluation
--- a/examples/research_projects/rag/distributed_pytorch_retriever.py
+++ b/examples/research_projects/rag/distributed_pytorch_retriever.py
@@ -70,7 +70,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):
            logger.info("dist not initialized / main")
            self.index.init_index()

-        # all processes wait untill the retriever is initialized by the main process
+        # all processes wait until the retriever is initialized by the main process
        if dist.is_initialized():
            torch.distributed.barrier(group=self.process_group)

--- a/examples/research_projects/rag/finetune_rag.py
+++ b/examples/research_projects/rag/finetune_rag.py
@@ -458,7 +458,7 @@ class GenerativeQAModule(BaseTransformer):
            default=None,
            help=(
                "Name of the index to use: 'hf' for a canonical dataset from the datasets library (default), 'custom'"
-                " for a local index, or 'legacy' for the orignal one)"
+                " for a local index, or 'legacy' for the original one)"
            ),
        )
        parser.add_argument(
--- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
+++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
@@ -266,7 +266,7 @@ class DataCollatorCTCWithPadding:
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.AutoProcessor`)
-            The processor used for proccessing the data.
+            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
--- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py
+++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py
@@ -257,7 +257,7 @@ class DataCollatorCTCWithPadding:
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.AutoProcessor`)
-            The processor used for proccessing the data.
+            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
--- a/examples/research_projects/wav2vec2/run_asr.py
+++ b/examples/research_projects/wav2vec2/run_asr.py
@@ -226,7 +226,7 @@ class DataCollatorCTCWithPadding:
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
-            The processor used for proccessing the data.
+            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
--- a/examples/research_projects/wav2vec2/run_common_voice.py
+++ b/examples/research_projects/wav2vec2/run_common_voice.py
@@ -145,7 +145,7 @@ class DataCollatorCTCWithPadding:
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
-            The processor used for proccessing the data.
+            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
--- a/examples/research_projects/wav2vec2/run_pretrain.py
+++ b/examples/research_projects/wav2vec2/run_pretrain.py
@@ -142,7 +142,7 @@ class DataCollatorForWav2Vec2Pretraining:
            The Wav2Vec2 model used for pretraining. The data collator needs to have access
            to config and ``_get_feat_extract_output_lengths`` function for correct padding.
        feature_extractor (:class:`~transformers.Wav2Vec2FeatureExtractor`):
-            The processor used for proccessing the data.
+            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among: