From 214062201e85276720d86a858e6f3b745e64c6ec Mon Sep 17 00:00:00 2001
From: co63oc <co63oc@users.noreply.github.com>
Date: Fri, 25 Apr 2025 20:47:25 +0800
Subject: [PATCH] Fix typos in strings and comments (#37784)

* Fix typos in strings and comments

* Fix
---
 src/transformers/generation/streamers.py      |  4 +-
 src/transformers/generation/tf_utils.py       |  2 +-
 src/transformers/generation/utils.py          |  4 +-
 src/transformers/generation/watermarking.py   |  2 +-
 src/transformers/integrations/aqlm.py         |  2 +-
 src/transformers/integrations/awq.py          |  2 +-
 src/transformers/integrations/bitnet.py       |  2 +-
 src/transformers/integrations/bitsandbytes.py |  6 +--
 src/transformers/integrations/eetq.py         |  2 +-
 src/transformers/integrations/fbgemm_fp8.py   |  4 +-
 src/transformers/integrations/ggml.py         |  2 +-
 src/transformers/integrations/higgs.py        |  2 +-
 .../integrations/integration_utils.py         |  2 +-
 src/transformers/integrations/peft.py         |  2 +-
 src/transformers/integrations/quanto.py       |  2 +-
 .../integrations/tensor_parallel.py           |  8 ++--
 src/transformers/integrations/vptq.py         |  2 +-
 src/transformers/loss/loss_rt_detr.py         |  2 +-
 src/transformers/modeling_utils.py            |  2 +-
 .../models/bamba/modeling_bamba.py            |  4 +-
 .../models/bamba/modular_bamba.py             |  4 +-
 src/transformers/models/bark/modeling_bark.py |  6 +--
 ...ping_original_tf2_checkpoint_to_pytorch.py |  4 +-
 .../models/bert/tokenization_bert.py          |  2 +-
 .../tokenization_bert_japanese.py             |  4 +-
 .../models/big_bird/modeling_big_bird.py      |  2 +-
 .../models/big_bird/modeling_flax_big_bird.py |  4 +-
 .../modeling_bigbird_pegasus.py               |  2 +-
 src/transformers/models/blip/modeling_blip.py |  6 +--
 .../models/blip/modeling_tf_blip.py           |  6 +--
 ...rt_bloom_original_checkpoint_to_pytorch.py |  8 ++--
 .../camembert/tokenization_camembert.py       |  2 +-
 .../convert_chameleon_weights_to_hf.py        |  2 +-
 .../models/chameleon/modeling_chameleon.py    |  2 +-
 src/transformers/models/clap/modeling_clap.py | 10 ++---
 .../models/clvp/feature_extraction_clvp.py    |  4 +-
 .../modeling_conditional_detr.py              | 12 ++---
 .../models/convbert/tokenization_convbert.py  |  2 +-
 .../models/cpmant/modeling_cpmant.py          |  2 +-
 .../models/dab_detr/modeling_dab_detr.py      |  6 +--
 .../data2vec/configuration_data2vec_audio.py  |  4 +-
 .../data2vec/modeling_tf_data2vec_vision.py   |  2 +-
 .../configuration_decision_transformer.py     |  2 +-
 .../modeling_decision_transformer.py          |  2 +-
 .../modeling_deformable_detr.py               |  6 +--
 .../models/deprecated/deta/modeling_deta.py   |  8 ++--
 ..._original_pytorch_checkpoint_to_pytorch.py |  4 +-
 ...convert_gptsan_tf_checkpoint_to_pytorch.py |  2 +-
 .../modeling_gptsan_japanese.py               | 10 ++---
 .../tokenization_gptsan_japanese.py           |  4 +-
 .../graphormer/modeling_graphormer.py         |  2 +-
 .../deprecated/jukebox/convert_jukebox.py     |  2 +-
 .../deprecated/jukebox/modeling_jukebox.py    | 44 +++++++++----------
 .../models/deprecated/mctct/modeling_mctct.py |  2 +-
 .../models/deprecated/mega/modeling_mega.py   |  8 ++--
 .../deprecated/realm/retrieval_realm.py       |  2 +-
 .../deprecated/realm/tokenization_realm.py    |  2 +-
 .../retribert/tokenization_retribert.py       |  2 +-
 .../transfo_xl/configuration_transfo_xl.py    |  2 +-
 .../tvlt/feature_extraction_tvlt.py           |  4 +-
 .../deprecated/tvlt/image_processing_tvlt.py  |  4 +-
 .../deprecated/van/convert_van_to_pytorch.py  |  2 +-
 .../convert_depth_anything_to_hf.py           |  2 +-
 .../models/depth_pro/modeling_depth_pro.py    |  4 +-
 src/transformers/models/detr/modeling_detr.py | 12 ++---
 .../distilbert/tokenization_distilbert.py     |  2 +-
 .../models/dpt/convert_dinov2_depth_to_hf.py  |  2 +-
 .../models/dpt/convert_dpt_beit_to_hf.py      |  2 +-
 .../models/dpt/image_processing_dpt.py        | 14 +++---
 .../efficientnet/modeling_efficientnet.py     |  2 +-
 .../models/electra/modeling_flax_electra.py   |  2 +-
 .../models/electra/tokenization_electra.py    |  2 +-
 src/transformers/models/emu3/modeling_emu3.py |  2 +-
 src/transformers/models/emu3/modular_emu3.py  |  2 +-
 .../models/encodec/configuration_encodec.py   |  2 +-
 .../models/encodec/modeling_encodec.py        |  2 +-
 .../configuration_encoder_decoder.py          |  2 +-
 .../modeling_flax_encoder_decoder.py          |  2 +-
 .../modeling_tf_encoder_decoder.py            |  2 +-
 .../models/flava/modeling_flava.py            |  2 +-
 .../models/funnel/tokenization_funnel.py      |  2 +-
 src/transformers/models/gpt2/modeling_gpt2.py |  2 +-
 .../tokenization_gpt_neox_japanese.py         |  2 +-
 .../granitemoe/configuration_granitemoe.py    |  4 +-
 .../models/granitemoe/modeling_granitemoe.py  |  5 ++-
 .../configuration_granitemoeshared.py         |  4 +-
 .../modeling_granitemoeshared.py              |  5 ++-
 .../grounding_dino/modeling_grounding_dino.py | 10 ++---
 .../models/hubert/configuration_hubert.py     |  4 +-
 .../models/idefics/modeling_idefics.py        |  4 +-
 .../models/idefics/processing_idefics.py      |  2 +-
 .../models/jamba/modeling_jamba.py            |  4 +-
 .../models/jetmoe/configuration_jetmoe.py     |  2 +-
 .../models/jetmoe/modeling_jetmoe.py          |  2 +-
 .../models/layoutlm/tokenization_layoutlm.py  |  2 +-
 .../layoutlmv2/tokenization_layoutlmv2.py     |  2 +-
 .../models/layoutlmv3/modeling_layoutlmv3.py  |  2 +-
 src/transformers/models/led/modeling_led.py   |  2 +-
 .../llama4/convert_llama4_weights_to_hf.py    |  4 +-
 .../models/llama4/modeling_llama4.py          |  4 +-
 .../llava/convert_llava_weights_to_hf.py      |  4 +-
 .../models/llava/processing_llava.py          |  2 +-
 .../llava_next/image_processing_llava_next.py |  2 +-
 .../llava_next/processing_llava_next.py       |  2 +-
 .../processing_llava_next_video.py            |  2 +-
 .../convert_llava_onevision_weights_to_hf.py  |  2 +-
 .../image_processing_llava_onevision.py       |  4 +-
 .../processing_llava_onevision.py             |  2 +-
 .../models/longt5/configuration_longt5.py     |  2 +-
 .../convert_longt5x_checkpoint_to_flax.py     |  2 +-
 .../models/longt5/modeling_flax_longt5.py     |  2 +-
 .../models/longt5/modeling_longt5.py          |  2 +-
 .../models/lxmert/configuration_lxmert.py     |  2 +-
 .../models/lxmert/modeling_lxmert.py          |  4 +-
 .../models/lxmert/modeling_tf_lxmert.py       |  4 +-
 .../models/lxmert/tokenization_lxmert.py      |  2 +-
 .../models/mamba/configuration_mamba.py       |  2 +-
 .../models/mamba2/modeling_mamba2.py          |  4 +-
 .../marian/convert_marian_to_pytorch.py       |  2 +-
 .../models/marian/modeling_marian.py          |  2 +-
 .../mask2former/modeling_mask2former.py       | 10 ++---
 .../models/maskformer/modeling_maskformer.py  | 10 ++---
 .../maskformer/modeling_maskformer_swin.py    |  2 +-
 .../models/mistral/modeling_tf_mistral.py     |  2 +-
 .../models/mixtral/configuration_mixtral.py   |  2 +-
 .../mllama/convert_mllama_weights_to_hf.py    |  2 +-
 .../mobilebert/tokenization_mobilebert.py     |  2 +-
 .../models/modernbert/modeling_modernbert.py  |  2 +-
 .../models/modernbert/modular_modernbert.py   |  2 +-
 .../models/mpnet/tokenization_mpnet.py        |  2 +-
 src/transformers/models/mpt/modeling_mpt.py   |  2 +-
 .../mra/convert_mra_pytorch_to_pytorch.py     |  2 +-
 .../modeling_musicgen_melody.py               |  2 +-
 ..._original_pytorch_checkpoint_to_pytorch.py |  2 +-
 .../models/olmoe/configuration_olmoe.py       |  2 +-
 .../oneformer/configuration_oneformer.py      |  2 +-
 .../models/oneformer/modeling_oneformer.py    | 12 ++---
 .../models/owlv2/image_processing_owlv2.py    |  2 +-
 .../models/owlv2/processing_owlv2.py          |  2 +-
 .../models/owlvit/processing_owlvit.py        |  2 +-
 .../models/patchtst/modeling_patchtst.py      |  2 +-
 .../convert_phi4_multimodal_weights_to_hf.py  |  2 +-
 .../feature_extraction_phi4_multimodal.py     |  4 +-
 .../modeling_phi4_multimodal.py               |  4 +-
 .../modular_phi4_multimodal.py                |  4 +-
 .../pop2piano/feature_extraction_pop2piano.py | 10 ++---
 .../pop2piano/tokenization_pop2piano.py       |  2 +-
 .../image_processing_prompt_depth_anything.py | 14 +++---
 .../prophetnet/tokenization_prophetnet.py     |  2 +-
 .../pvt_v2/convert_pvt_v2_to_pytorch.py       |  2 +-
 .../configuration_qwen2_5_omni.py             |  6 +--
 .../qwen2_5_omni/modeling_qwen2_5_omni.py     |  8 ++--
 .../qwen2_5_omni/modular_qwen2_5_omni.py      | 14 +++---
 .../qwen2_vl/image_processing_qwen2_vl.py     |  4 +-
 .../image_processing_qwen2_vl_fast.py         |  4 +-
 .../qwen3_moe/configuration_qwen3_moe.py      |  2 +-
 .../models/rag/configuration_rag.py           |  2 +-
 .../models/rag/modeling_tf_rag.py             |  2 +-
 .../regnet/convert_regnet_to_pytorch.py       |  2 +-
 .../models/regnet/modeling_flax_regnet.py     |  2 +-
 .../resnet/convert_resnet_to_pytorch.py       |  2 +-
 .../models/resnet/modeling_flax_resnet.py     |  2 +-
 .../models/roc_bert/tokenization_roc_bert.py  |  2 +-
 .../models/roformer/tokenization_roformer.py  |  2 +-
 .../models/rt_detr/modeling_rt_detr.py        |  2 +-
 .../models/rt_detr_v2/modeling_rt_detr_v2.py  |  2 +-
 .../models/rwkv/configuration_rwkv.py         |  2 +-
 .../models/sam/image_processing_sam.py        | 12 ++---
 src/transformers/models/sam/modeling_sam.py   |  6 +--
 .../models/sam/modeling_tf_sam.py             |  2 +-
 src/transformers/models/sam/processing_sam.py |  2 +-
 .../seamless_m4t/modeling_seamless_m4t.py     | 12 ++---
 .../modeling_seamless_m4t_v2.py               |  4 +-
 .../models/sew/configuration_sew.py           |  4 +-
 .../models/sew_d/configuration_sew_d.py       |  4 +-
 .../shieldgemma2/processing_shieldgemma2.py   |  4 +-
 .../configuration_speech_encoder_decoder.py   |  2 +-
 .../modeling_flax_speech_encoder_decoder.py   |  2 +-
 .../modeling_speech_encoder_decoder.py        |  2 +-
 .../speech_to_text/modeling_speech_to_text.py |  2 +-
 .../modeling_tf_speech_to_text.py             |  2 +-
 .../models/speecht5/configuration_speecht5.py |  4 +-
 .../models/splinter/tokenization_splinter.py  |  2 +-
 .../squeezebert/tokenization_squeezebert.py   |  2 +-
 .../modeling_table_transformer.py             |  6 +--
 .../models/tapas/tokenization_tapas.py        |  2 +-
 src/transformers/models/tvp/modeling_tvp.py   |  2 +-
 .../unispeech/configuration_unispeech.py      |  4 +-
 .../configuration_unispeech_sat.py            |  4 +-
 .../video_llava/processing_video_llava.py     |  2 +-
 .../modeling_flax_vision_encoder_decoder.py   |  2 +-
 .../modeling_tf_vision_encoder_decoder.py     |  2 +-
 .../modeling_vision_encoder_decoder.py        |  2 +-
 .../modeling_flax_vision_text_dual_encoder.py |  4 +-
 .../modeling_tf_vision_text_dual_encoder.py   |  4 +-
 .../modeling_vision_text_dual_encoder.py      |  4 +-
 .../models/vit_mae/modeling_tf_vit_mae.py     |  2 +-
 .../models/wav2vec2/configuration_wav2vec2.py |  4 +-
 .../models/wav2vec2/modeling_wav2vec2.py      |  2 +-
 .../configuration_wav2vec2_bert.py            |  4 +-
 .../convert_wav2vec2_seamless_checkpoint.py   |  2 +-
 .../wav2vec2_bert/modeling_wav2vec2_bert.py   |  2 +-
 .../wav2vec2_bert/modular_wav2vec2_bert.py    |  2 +-
 .../configuration_wav2vec2_conformer.py       |  4 +-
 .../modeling_wav2vec2_conformer.py            |  2 +-
 .../tokenization_wav2vec2_phoneme.py          |  2 +-
 .../models/whisper/configuration_whisper.py   |  6 +--
 .../whisper/feature_extraction_whisper.py     |  4 +-
 .../models/whisper/generation_whisper.py      | 10 ++---
 .../models/whisper/tokenization_whisper.py    |  6 +--
 .../whisper/tokenization_whisper_fast.py      |  4 +-
 .../models/x_clip/configuration_x_clip.py     |  2 +-
 .../models/yolos/modeling_yolos.py            |  6 +--
 .../yoso/convert_yoso_pytorch_to_pytorch.py   |  2 +-
 .../models/zamba/modeling_zamba.py            |  2 +-
 .../models/zamba2/modeling_zamba2.py          |  4 +-
 .../models/zamba2/modular_zamba2.py           |  2 +-
 .../zoedepth/image_processing_zoedepth.py     | 16 +++----
 .../pipelines/automatic_speech_recognition.py |  2 +-
 src/transformers/pipelines/mask_generation.py |  2 +-
 src/transformers/quantizers/base.py           |  2 +-
 .../quantizers/quantizer_torchao.py           |  2 +-
 src/transformers/trainer.py                   |  2 +-
 src/transformers/utils/import_utils.py        |  2 +-
 src/transformers/utils/quantization_config.py |  6 +--
 tests/models/bert/test_modeling_tf_bert.py    |  4 +-
 .../electra/test_modeling_tf_electra.py       |  4 +-
 tests/models/esm/test_modeling_tf_esm.py      |  2 +-
 .../rembert/test_modeling_tf_rembert.py       |  4 +-
 .../roberta/test_modeling_tf_roberta.py       |  4 +-
 .../test_modeling_tf_roberta_prelayernorm.py  |  4 +-
 tests/test_processing_common.py               |  2 +-
 232 files changed, 431 insertions(+), 425 deletions(-)

diff --git a/src/transformers/generation/streamers.py b/src/transformers/generation/streamers.py
index c0fc26442b..863fd67bdd 100644
--- a/src/transformers/generation/streamers.py
+++ b/src/transformers/generation/streamers.py
@@ -162,7 +162,7 @@ class TextStreamer(BaseStreamer):
 class TextIteratorStreamer(TextStreamer):
     """
     Streamer that stores print-ready text in a queue, to be used by a downstream application as an iterator. This is
-    useful for applications that benefit from acessing the generated text in a non-blocking way (e.g. in an interactive
+    useful for applications that benefit from accessing the generated text in a non-blocking way (e.g. in an interactive
     Gradio demo).
 
     <Tip warning={true}>
@@ -233,7 +233,7 @@ class TextIteratorStreamer(TextStreamer):
 class AsyncTextIteratorStreamer(TextStreamer):
     """
     Streamer that stores print-ready text in a queue, to be used by a downstream application as an async iterator.
-    This is useful for applications that benefit from acessing the generated text asynchronously (e.g. in an
+    This is useful for applications that benefit from accessing the generated text asynchronously (e.g. in an
     interactive Gradio demo).
 
     <Tip warning={true}>
diff --git a/src/transformers/generation/tf_utils.py b/src/transformers/generation/tf_utils.py
index 344147e6e3..b2da6f18ba 100644
--- a/src/transformers/generation/tf_utils.py
+++ b/src/transformers/generation/tf_utils.py
@@ -2082,7 +2082,7 @@ class TFGenerationMixin:
 
         def gather_fn(tensor):
             if batch_axis > 0:
-                # pushes all dimentions before the batch to the end, so we get (batch, beam_id, ...)
+                # pushes all dimensions before the batch to the end, so we get (batch, beam_id, ...)
                 perm = tf.concat((tf.range(tf.rank(tensor))[batch_axis:], tf.range(batch_axis)), axis=0)
                 tensor = tf.transpose(tensor, perm=perm)
 
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index c0f6da8d76..995b556b7e 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1285,7 +1285,7 @@ class GenerationMixin:
         Merge user-defined processors/criteria with the ones instantiated inside `generate`. In case the same
         processor/criteria is present on both lists, use the user-defined one.
 
-        (Note: up to v4.49.0, this funtion threw an exception is the same logit processor was found twice.)
+        (Note: up to v4.49.0, this function threw an exception if the same logit processor was found twice.)
         """
         if len(custom_list) == 0:
             return default_list
@@ -3852,7 +3852,7 @@ class GenerationMixin:
 
         model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
 
-        # (joao) feature lost in the refactor. Probably won't implement, hurts readbility with minimal gains (there
+        # (joao) feature lost in the refactor. Probably won't implement, hurts readability with minimal gains (there
         # are newer low-memory alternatives like the offloaded cache)
         sequential = generation_config.low_memory
         if sequential:
diff --git a/src/transformers/generation/watermarking.py b/src/transformers/generation/watermarking.py
index e5f900c3b7..139d8cf2c7 100644
--- a/src/transformers/generation/watermarking.py
+++ b/src/transformers/generation/watermarking.py
@@ -538,7 +538,7 @@ class SynthIDTextWatermarkDetector:
         context_repetition_mask = self.logits_processor.compute_context_repetition_mask(
             input_ids=tokenized_outputs,
         )
-        # context repitition mask shape [batch_size, output_len - (ngram_len - 1)]
+        # context repetition mask shape [batch_size, output_len - (ngram_len - 1)]
 
         combined_mask = context_repetition_mask * eos_token_mask
 
diff --git a/src/transformers/integrations/aqlm.py b/src/transformers/integrations/aqlm.py
index 0626da7ace..cdbed289bd 100644
--- a/src/transformers/integrations/aqlm.py
+++ b/src/transformers/integrations/aqlm.py
@@ -30,7 +30,7 @@ def replace_with_aqlm_linear(
     """
     Public method that recursively replaces the Linear layers of the given model with AQLM quantized layers.
     `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
-    conversion has been successfull or not.
+    conversion has been successful or not.
 
     Args:
         model (`torch.nn.Module`):
diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py
index c860ea1f53..23a418ead5 100644
--- a/src/transformers/integrations/awq.py
+++ b/src/transformers/integrations/awq.py
@@ -101,7 +101,7 @@ def replace_with_awq_linear(
     """
     Public method that recursively replaces the Linear layers of the given model with AWQ quantized layers.
     `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
-    conversion has been successfull or not.
+    conversion has been successful or not.
 
     During the module replacement, we also infer the backend to use through the `quantization_config` object.
 
diff --git a/src/transformers/integrations/bitnet.py b/src/transformers/integrations/bitnet.py
index 0b50f9738a..aafca87856 100644
--- a/src/transformers/integrations/bitnet.py
+++ b/src/transformers/integrations/bitnet.py
@@ -201,7 +201,7 @@ def _replace_with_bitnet_linear(
     """
     Private method that wraps the recursion for module replacement.
 
-    Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
+    Returns the converted model and a boolean that indicates if the conversion has been successful or not.
     """
 
     if current_key_name is None:
diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py
index 3973dc58c0..7a6f6e107a 100644
--- a/src/transformers/integrations/bitsandbytes.py
+++ b/src/transformers/integrations/bitsandbytes.py
@@ -158,7 +158,7 @@ def _replace_with_bnb_linear(
     """
     Private method that wraps the recursion for module replacement.
 
-    Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
+    Returns the converted model and a boolean that indicates if the conversion has been successful or not.
     """
     for name, module in model.named_children():
         if current_key_name is None:
@@ -280,7 +280,7 @@ def replace_8bit_linear(*args, **kwargs):
     return replace_with_bnb_linear(*args, **kwargs)
 
 
-# For backward compatiblity
+# For backward compatibility
 def set_module_8bit_tensor_to_device(*args, **kwargs):
     warnings.warn(
         "`set_module_8bit_tensor_to_device` will be deprecated in a future version, please use `set_module_quantized_tensor_to_device` instead",
@@ -403,7 +403,7 @@ def _dequantize_and_replace(
     some performance drop compared to the original model before quantization - use it only for specific usecases
     such as QLoRA adapters merging.
 
-    Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
+    Returns the converted model and a boolean that indicates if the conversion has been successful or not.
     """
     quant_method = quantization_config.quantization_method()
 
diff --git a/src/transformers/integrations/eetq.py b/src/transformers/integrations/eetq.py
index 97698cf1aa..a3d124aa4b 100644
--- a/src/transformers/integrations/eetq.py
+++ b/src/transformers/integrations/eetq.py
@@ -36,7 +36,7 @@ def _replace_with_eetq_linear(
     """
     Private method that wraps the recursion for module replacement.
 
-    Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
+    Returns the converted model and a boolean that indicates if the conversion has been successful or not.
     """
     if current_key_name is None:
         current_key_name = []
diff --git a/src/transformers/integrations/fbgemm_fp8.py b/src/transformers/integrations/fbgemm_fp8.py
index 1cc5a8b23a..ba4faa96c2 100644
--- a/src/transformers/integrations/fbgemm_fp8.py
+++ b/src/transformers/integrations/fbgemm_fp8.py
@@ -167,7 +167,7 @@ def _replace_with_fbgemm_fp8_linear(
     """
     Private method that wraps the recursion for module replacement.
 
-    Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
+    Returns the converted model and a boolean that indicates if the conversion has been successful or not.
     """
 
     import re
@@ -196,7 +196,7 @@ def _replace_with_fbgemm_fp8_linear(
 
                     # Force requires grad to False to avoid unexpected errors
                     model._modules[name].requires_grad_(False)
-                # set non persistant buffer outside of init_empty_weights
+                # set non persistent buffer outside of init_empty_weights
                 model._modules[name].input_scale_ub = torch.tensor(
                     [quantization_config.activation_scale_ub],
                     dtype=torch.float,
diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index f1a25f2744..51bdc88608 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -424,7 +424,7 @@ class GGUFLlamaConverter(LlamaConverter):
         if post_processor:
             tokenizer.post_processor = post_processor
 
-        # HACK: patch the llama-3 tokenizer to use the correspinding pre-tokenizer
+        # HACK: patch the llama-3 tokenizer to use the corresponding pre-tokenizer
         # and normalizer
         if self.is_llama_3_tokenizer:
             tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
diff --git a/src/transformers/integrations/higgs.py b/src/transformers/integrations/higgs.py
index dd31764dfe..02c9a23dc6 100644
--- a/src/transformers/integrations/higgs.py
+++ b/src/transformers/integrations/higgs.py
@@ -558,7 +558,7 @@ def replace_with_higgs_linear(
     """
     Public method that recursively replaces the Linear layers of the given model with HIGGS quantized layers.
     `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
-    conversion has been successfull or not.
+    conversion has been successful or not.
 
     Args:
         model (`torch.nn.Module`):
diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py
index a696612c3b..f18eb43c78 100755
--- a/src/transformers/integrations/integration_utils.py
+++ b/src/transformers/integrations/integration_utils.py
@@ -1093,7 +1093,7 @@ class CometCallback(TrainerCallback):
             if state.is_hyper_param_search:
                 if mode is not None:
                     logger.warning(
-                        "Hyperparameter Search is enabled, forcing the creation of new experimetns, COMET_MODE value %r  is ignored",
+                        "Hyperparameter Search is enabled, forcing the creation of new experiments, COMET_MODE value %r  is ignored",
                         comet_old_mode,
                     )
                 mode = "create"
diff --git a/src/transformers/integrations/peft.py b/src/transformers/integrations/peft.py
index 6aa3b137b1..0c9402abe6 100644
--- a/src/transformers/integrations/peft.py
+++ b/src/transformers/integrations/peft.py
@@ -350,7 +350,7 @@ class PeftAdapterMixin:
 
         for _, module in self.named_modules():
             if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
-                # For backward compatbility with previous PEFT versions
+                # For backward compatibility with previous PEFT versions
                 if hasattr(module, "set_adapter"):
                     module.set_adapter(adapter_name)
                 else:
diff --git a/src/transformers/integrations/quanto.py b/src/transformers/integrations/quanto.py
index 1c57023219..c63b9b3b0c 100644
--- a/src/transformers/integrations/quanto.py
+++ b/src/transformers/integrations/quanto.py
@@ -30,7 +30,7 @@ def replace_with_quanto_layers(
 ):
     """
     Public method that recursively replaces the Linear layers of the given model with Quanto quantized layers.
-    Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
+    Returns the converted model and a boolean that indicates if the conversion has been successful or not.
 
     Args:
         model (`torch.nn.Module`):
diff --git a/src/transformers/integrations/tensor_parallel.py b/src/transformers/integrations/tensor_parallel.py
index d1fef6d492..61bf29224d 100644
--- a/src/transformers/integrations/tensor_parallel.py
+++ b/src/transformers/integrations/tensor_parallel.py
@@ -611,14 +611,14 @@ def add_tensor_parallel_hooks_to_module(model, module, tp_plan, layer_name, curr
                 f"Trying to prepare {layer_name}, but it's not supported. Corresponding module: {module} Fix it's TP plan: {e}"
             )
 
-    # 2. We add hooks to the parrent module if needed
+    # 2. We add hooks to the parent module if needed
     if "." in layer_name:
-        parrent_layer_name = layer_name.rsplit(".", 1)[0]
-        generic_name = re.sub(r"\d+", "*", parrent_layer_name)
+        parent_layer_name = layer_name.rsplit(".", 1)[0]
+        generic_name = re.sub(r"\d+", "*", parent_layer_name)
         # The module itself needs hooks
         if module_plan := tp_plan.get(generic_name, False):
             tp_layer = translate_to_torch_parallel_style(module_plan)
-            module_to_tp_ = model.get_submodule(parrent_layer_name)
+            module_to_tp_ = model.get_submodule(parent_layer_name)
             tp_layer.prepare_module_tp(module_to_tp_, device_mesh)
 
 
diff --git a/src/transformers/integrations/vptq.py b/src/transformers/integrations/vptq.py
index aa435517e8..f76bd70377 100644
--- a/src/transformers/integrations/vptq.py
+++ b/src/transformers/integrations/vptq.py
@@ -28,7 +28,7 @@ def replace_with_vptq_linear(
     """
     Public method that recursively replaces the Linear layers of the given model with VPTQ quantized layers.
     `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
-    conversion has been successfull or not.
+    conversion has been successful or not.
 
     Args:
         model (`torch.nn.Module`):
diff --git a/src/transformers/loss/loss_rt_detr.py b/src/transformers/loss/loss_rt_detr.py
index 88a4ac7cf4..a65d06de10 100644
--- a/src/transformers/loss/loss_rt_detr.py
+++ b/src/transformers/loss/loss_rt_detr.py
@@ -112,7 +112,7 @@ class RTDetrHungarianMatcher(nn.Module):
 
         # Compute the L1 cost between boxes
         bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
-        # Compute the giou cost betwen boxes
+        # Compute the giou cost between boxes
         giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
         # Compute the final cost matrix
         cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index e26801e106..5fbf51c297 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1395,7 +1395,7 @@ def _find_mismatched_keys(
         for key in new_state_dict.keys():
             if key in model_state_dict and new_state_dict[key].shape != model_state_dict[key].shape:
                 # This skips size mismatches for 4-bit weights. Two 4-bit values share an 8-bit container, causing size differences.
-                # Without matching with module type or paramter type it seems like a practical way to detect valid 4bit weights.
+                # Without matching with module type or parameter type it seems like a practical way to detect valid 4bit weights.
                 if not (
                     new_state_dict[key].shape[-1] == 1
                     and new_state_dict[key].numel() * 2 == model_state_dict[key].numel()
diff --git a/src/transformers/models/bamba/modeling_bamba.py b/src/transformers/models/bamba/modeling_bamba.py
index 074eea3aa9..a6220fbf92 100644
--- a/src/transformers/models/bamba/modeling_bamba.py
+++ b/src/transformers/models/bamba/modeling_bamba.py
@@ -463,7 +463,7 @@ class BambaMixer(nn.Module):
             projection_size,
             bias=self.use_bias,
         )
-        # selective projection used to make dt, B and C input dependant
+        # selective projection used to make dt, B and C input dependent
 
         # time step projection (discretization)
         # instantiate once and copy inv_dt in init_weights of PretrainedModel
@@ -1541,7 +1541,7 @@ class BambaForCausalLM(BambaPreTrainedModel, GenerationMixin):
         use_cache=True,
         **kwargs,
     ):
-        # Overwitten -- has a unique cache type, `HybridMambaAttentionDynamicCache`
+        # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache`
 
         empty_past_kv = past_key_values is None
 
diff --git a/src/transformers/models/bamba/modular_bamba.py b/src/transformers/models/bamba/modular_bamba.py
index dd0d0e62c6..3ee6e72711 100644
--- a/src/transformers/models/bamba/modular_bamba.py
+++ b/src/transformers/models/bamba/modular_bamba.py
@@ -260,7 +260,7 @@ class BambaMixer(nn.Module):
             projection_size,
             bias=self.use_bias,
         )
-        # selective projection used to make dt, B and C input dependant
+        # selective projection used to make dt, B and C input dependent
 
         # time step projection (discretization)
         # instantiate once and copy inv_dt in init_weights of PretrainedModel
@@ -1257,7 +1257,7 @@ class BambaForCausalLM(LlamaForCausalLM):
         use_cache=True,
         **kwargs,
     ):
-        # Overwitten -- has a unique cache type, `HybridMambaAttentionDynamicCache`
+        # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache`
 
         empty_past_kv = past_key_values is None
 
diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py
index 57a0c4e5a7..2357fd5375 100644
--- a/src/transformers/models/bark/modeling_bark.py
+++ b/src/transformers/models/bark/modeling_bark.py
@@ -1296,7 +1296,7 @@ class BarkFineModel(BarkPreTrainedModel):
     @add_start_docstrings_to_model_forward(BARK_FINE_INPUTS_DOCSTRING)
     def forward(
         self,
-        codebook_idx: int,  # an additionnal idx corresponding to the id of the codebook that will be predicted
+        codebook_idx: int,  # an additional idx corresponding to the id of the codebook that will be predicted
         input_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
@@ -1547,7 +1547,7 @@ class BarkFineModel(BarkPreTrainedModel):
     - [`BarkSemanticModel`] (also referred to as the 'text' model): a causal auto-regressive transformer model that
       takes
     as input tokenized text, and predicts semantic text tokens that capture the meaning of the text.
-    - [`BarkCoarseModel`] (also refered to as the 'coarse acoustics' model), also a causal autoregressive transformer,
+    - [`BarkCoarseModel`] (also referred to as the 'coarse acoustics' model), also a causal autoregressive transformer,
     that takes into input the results of the last model. It aims at regressing the first two audio codebooks necessary
     to `encodec`.
     - [`BarkFineModel`] (the 'fine acoustics' model), this time a non-causal autoencoder transformer, which iteratively
@@ -1640,7 +1640,7 @@ class BarkModel(BarkPreTrainedModel):
             self.to("cpu")
             torch_accelerator_module.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
 
-        # this layer is used outside the first foward pass of semantic so need to be loaded before semantic
+        # this layer is used outside the first forward pass of semantic so it needs to be loaded before semantic
         self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)
 
         hook = None
diff --git a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py
index cba1e1a2c3..f39ed47a19 100644
--- a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py
+++ b/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py
@@ -67,10 +67,10 @@ def convert_checkpoint_to_pytorch(tf_checkpoint_path: str, config_path: str, pyt
 
         return torch.from_numpy(array)
 
-    def get_encoder_attention_layer_array(layer_index: int, name: str, orginal_shape):
+    def get_encoder_attention_layer_array(layer_index: int, name: str, original_shape):
         full_name = f"encoder/_transformer_layers/{layer_index}/_attention_layer/{name}/.ATTRIBUTES/VARIABLE_VALUE"
         array = tf.train.load_variable(tf_checkpoint_path, full_name)
-        array = array.reshape(orginal_shape)
+        array = array.reshape(original_shape)
 
         if "kernel" in name:
             array = array.transpose()
diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py
index 42d4dd9455..141f10464a 100644
--- a/src/transformers/models/bert/tokenization_bert.py
+++ b/src/transformers/models/bert/tokenization_bert.py
@@ -460,7 +460,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
index 8a841a3091..3a2b6f46f8 100644
--- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
+++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
@@ -667,7 +667,7 @@ class CharacterTokenizer:
         """
         Tokenizes a piece of text into characters.
 
-        For example, `input = "apple""` wil return as output `["a", "p", "p", "l", "e"]`.
+        For example, `input = "apple"` will return as output `["a", "p", "p", "l", "e"]`.
 
         Args:
             text: A single token or whitespace separated tokens.
@@ -866,7 +866,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py
index abc5a1df44..3b250a056e 100755
--- a/src/transformers/models/big_bird/modeling_big_bird.py
+++ b/src/transformers/models/big_bird/modeling_big_bird.py
@@ -1171,7 +1171,7 @@ class BigBirdBlockSparseAttention(nn.Module):
             if plan_idx > 0:
                 # set the row for all from_blocks starting from 0 to
                 # plan_block_length[plan_idx-1]
-                # column indx start fromm plan_block_length[plan_idx-1] and ends at
+                # column index starts from plan_block_length[plan_idx-1] and ends at
                 # plan_block_length[plan_idx]
                 if plan_num_rand_blocks[plan_idx] > 0:
                     rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx]))
diff --git a/src/transformers/models/big_bird/modeling_flax_big_bird.py b/src/transformers/models/big_bird/modeling_flax_big_bird.py
index 7f43a4c5ab..e3bdfc38da 100644
--- a/src/transformers/models/big_bird/modeling_flax_big_bird.py
+++ b/src/transformers/models/big_bird/modeling_flax_big_bird.py
@@ -1055,7 +1055,7 @@ class FlaxBigBirdBlockSparseAttention(nn.Module):
             from_block_size: int. size of block in from sequence.
             to_block_size: int. size of block in to sequence.
             num_heads: int. total number of heads.
-            plan_from_length: list. plan from length where num_random_blocks are choosen from.
+            plan_from_length: list. plan from length where num_random_blocks are chosen from.
             plan_num_rand_blocks: list. number of rand blocks within the plan.
             indices_prng_key: jax.random.PRNGKey. PRNG key that is used to perform random jax operations.
             deterministic: bool. When False random attention will be used.
@@ -1104,7 +1104,7 @@ class FlaxBigBirdBlockSparseAttention(nn.Module):
             if plan_idx > 0:
                 # set the row for all from_blocks starting from 0 to
                 # plan_block_length[plan_idx-1]
-                # column indx start fromm plan_block_length[plan_idx-1] and ends at
+                # column index starts from plan_block_length[plan_idx-1] and ends at
                 # plan_block_length[plan_idx]
                 if plan_num_rand_blocks[plan_idx] > 0:
                     rnd_r_cnt = int(sum(plan_num_rand_blocks[:plan_idx]))
diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
index 6827a4a188..ee81c6b3af 100755
--- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
+++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
@@ -970,7 +970,7 @@ class BigBirdPegasusBlockSparseAttention(nn.Module):
             if plan_idx > 0:
                 # set the row for all from_blocks starting from 0 to
                 # plan_block_length[plan_idx-1]
-                # column indx start fromm plan_block_length[plan_idx-1] and ends at
+                # column index starts from plan_block_length[plan_idx-1] and ends at
                 # plan_block_length[plan_idx]
                 if plan_num_rand_blocks[plan_idx] > 0:
                     rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx]))
diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py
index 1f248ab8be..3b4d44afcd 100644
--- a/src/transformers/models/blip/modeling_blip.py
+++ b/src/transformers/models/blip/modeling_blip.py
@@ -64,7 +64,7 @@ class BlipForConditionalGenerationModelOutput(ModelOutput):
 
     Args:
         loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
-            Languge modeling loss from the text decoder.
+            Language modeling loss from the text decoder.
         logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
             Prediction scores of the language modeling head of the text decoder model.
         image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*):
@@ -109,7 +109,7 @@ class BlipTextVisionModelOutput(ModelOutput):
 
     Args:
         loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Languge modeling loss from the text decoder.
+            Language modeling loss from the text decoder.
         image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
             The image embeddings obtained by applying the projection layer to the pooler_output.
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@@ -145,7 +145,7 @@ class BlipImageTextMatchingModelOutput(ModelOutput):
         itm_score (`torch.FloatTensor`):
             The image-text similarity scores.
         loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Languge modeling loss from the text decoder.
+            Language modeling loss from the text decoder.
         image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
             The image embeddings obtained by applying the projection layer to the pooler_output.
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
diff --git a/src/transformers/models/blip/modeling_tf_blip.py b/src/transformers/models/blip/modeling_tf_blip.py
index 9573ca0fbb..36e8a2da7a 100644
--- a/src/transformers/models/blip/modeling_tf_blip.py
+++ b/src/transformers/models/blip/modeling_tf_blip.py
@@ -73,7 +73,7 @@ class TFBlipForConditionalGenerationModelOutput(ModelOutput):
 
     Args:
         loss (`tf.Tensor`, *optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`):
-            Languge modeling loss from the text decoder.
+            Language modeling loss from the text decoder.
         logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
             Prediction scores of the language modeling head of the text decoder model.
         image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`, *optional*):
@@ -118,7 +118,7 @@ class TFBlipTextVisionModelOutput(ModelOutput):
 
     Args:
         loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Languge modeling loss from the text decoder.
+            Language modeling loss from the text decoder.
         image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
             The image embeddings obtained by applying the projection layer to the pooler_output.
         last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
@@ -154,7 +154,7 @@ class TFBlipImageTextMatchingModelOutput(ModelOutput):
         itm_score (`tf.Tensor`):
             The image-text similarity scores.
         loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Languge modeling loss from the text decoder.
+            Language modeling loss from the text decoder.
         image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
             The image embeddings obtained by applying the projection layer to the pooler_output.
         last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
diff --git a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py b/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py
index 73d251875d..c4aa6f27c9 100644
--- a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py
+++ b/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py
@@ -116,12 +116,12 @@ def convert_bloom_checkpoint_to_pytorch(
                 else:
                     for key in tensors.keys():
                         if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH):
-                            # We average (sum and then divide) some weights accross TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425)
+                            # We average (sum and then divide) some weights across TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425)
                             tensors[key] += temp[key]
                         else:
                             # Some weights are RowParallelLinear in Megatron-Deepspeed, others are ColumnParallel
                             cat_dim = 1 if any(text in key for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0
-                            # We concatenate these weights accross TP ranks
+                            # We concatenate these weights across TP ranks
                             tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim)
 
             # Divide by the number of TP the weights we want to average
@@ -175,13 +175,13 @@ def convert_bloom_checkpoint_to_pytorch(
                     tensors = temp
                 else:
                     for key in tensors.keys():
-                        # We average (sum and then divide) some weights accross TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425)
+                        # We average (sum and then divide) some weights across TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425)
                         if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH):
                             tensors[key] += temp[key]
                         else:
                             # Some weights are RowParallelLinear in Megatron-Deepspeed, others are ColumnParallel
                             cat_dim = 1 if any(text in key for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0
-                            # We concatenate these weights accross TP ranks
+                            # We concatenate these weights across TP ranks
                             tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim)
 
             # Divide by the number of TP the weights we want to average
diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py
index 23cc569d49..76f95b5f8c 100644
--- a/src/transformers/models/camembert/tokenization_camembert.py
+++ b/src/transformers/models/camembert/tokenization_camembert.py
@@ -181,7 +181,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
 
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
-        # specifi to camembert, both 3 and 4 point to the unk token.
+        # specific to camembert, both 3 and 4 point to the unk token.
         if self.sp_model.PieceToId(token) == 0:
             # Convert sentence piece unk token to fairseq unk token index
             return self.unk_token_id
diff --git a/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py b/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py
index f74607f7b3..59b253b5ec 100644
--- a/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py
+++ b/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py
@@ -384,7 +384,7 @@ def write_model(model_path, input_base_path, model_size, chameleon_version=1):
         tokenizer_file=os.path.join(input_base_path, "tokenizer/text_tokenizer_modified.json"), legacy=False
     )
     tokenizer.sep_token_id = 8710  # assign <reserved08706> to sep so that we can append it after input text
-    tokenizer.pad_token_id = 1  # assing <pad> to special pad_token
+    tokenizer.pad_token_id = 1  # assign <pad> to special pad_token
     image_processor = ChameleonImageProcessor()
     processor = ChameleonProcessor(image_processor=image_processor, tokenizer=tokenizer)
     processor.save_pretrained(model_path)
diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py
index 9a7d43bdb5..b03336ce7e 100644
--- a/src/transformers/models/chameleon/modeling_chameleon.py
+++ b/src/transformers/models/chameleon/modeling_chameleon.py
@@ -124,7 +124,7 @@ class ChameleonLinearScalingRotaryEmbedding(ChameleonRotaryEmbedding):
     """ChameleonRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
 
     def forward(self, x, position_ids):
-        # difference to the original RoPE: a scaling factor is aplied to the position ids
+        # difference to the original RoPE: a scaling factor is applied to the position ids
         position_ids = position_ids.float() / self.scaling_factor
         cos, sin = super().forward(x, position_ids)
         return cos, sin
diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py
index a7a51cc86a..4f89deed49 100644
--- a/src/transformers/models/clap/modeling_clap.py
+++ b/src/transformers/models/clap/modeling_clap.py
@@ -864,9 +864,9 @@ class ClapAudioEncoder(nn.Module):
         _, _, time_length, freq_length = normalized_input_features.shape
 
         spec_width = int(self.spec_size * self.freq_ratio)
-        spec_heigth = self.spec_size // self.freq_ratio
+        spec_height = self.spec_size // self.freq_ratio
 
-        if time_length > spec_width or freq_length > spec_heigth:
+        if time_length > spec_width or freq_length > spec_height:
             raise ValueError("the wav size should be less than or equal to the swin input size")
 
         # to avoid bicubic zero error
@@ -874,14 +874,14 @@ class ClapAudioEncoder(nn.Module):
             normalized_input_features = nn.functional.interpolate(
                 normalized_input_features, (spec_width, freq_length), mode="bicubic", align_corners=True
             )
-        if freq_length < spec_heigth:
+        if freq_length < spec_height:
             normalized_input_features = nn.functional.interpolate(
-                normalized_input_features, (time_length, spec_heigth), mode="bicubic", align_corners=True
+                normalized_input_features, (time_length, spec_height), mode="bicubic", align_corners=True
             )
 
         batch, channels, time, freq = normalized_input_features.shape
 
-        # batch_size, channels, spec_width, spec_heigth --> batch_size, channels, spec_heigth * freq_ratio, spec_width // freq_ratio
+        # batch_size, channels, spec_width, spec_height --> batch_size, channels, spec_height * freq_ratio, spec_width // freq_ratio
         normalized_input_features = normalized_input_features.reshape(
             batch, channels * self.freq_ratio, time // self.freq_ratio, freq
         )
diff --git a/src/transformers/models/clvp/feature_extraction_clvp.py b/src/transformers/models/clvp/feature_extraction_clvp.py
index ce6f5c782e..6a4965971c 100644
--- a/src/transformers/models/clvp/feature_extraction_clvp.py
+++ b/src/transformers/models/clvp/feature_extraction_clvp.py
@@ -49,9 +49,9 @@ class ClvpFeatureExtractor(SequenceFeatureExtractor):
             The default length of raw audio in seconds. If `max_length` is not set during `__call__` then it will
             automatically be set to default_audio_length * `self.sampling_rate`.
         hop_length (`int`, *optional*, defaults to 256):
-            Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
+            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
         chunk_length (`int`, *optional*, defaults to 30):
-            The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio
+            The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
             sequences.
         n_fft (`int`, *optional*, defaults to 1024):
             Size of the Fourier transform.
diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py
index 90b7b68bb4..13fde28d02 100644
--- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py
+++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py
@@ -147,7 +147,7 @@ class ConditionalDetrObjectDetectionOutput(ModelOutput):
             possible padding). You can use [`~ConditionalDetrImageProcessor.post_process_object_detection`] to retrieve the
             unnormalized bounding boxes.
         auxiliary_outputs (`list[Dict]`, *optional*):
-            Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
             and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
             `pred_boxes`) for each decoder layer.
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
@@ -1550,8 +1550,8 @@ class ConditionalDetrModel(ConditionalDetrPreTrainedModel):
         flattened_mask = mask.flatten(1)
 
         # Fourth, sent flattened_features + flattened_mask + object_queries through encoder
-        # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size)
-        # flattened_mask is a Tensor of shape (batch_size, heigth*width)
+        # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
+        # flattened_mask is a Tensor of shape (batch_size, height*width)
         if encoder_outputs is None:
             encoder_outputs = self.encoder(
                 inputs_embeds=flattened_features,
@@ -1908,8 +1908,8 @@ class ConditionalDetrForSegmentation(ConditionalDetrPreTrainedModel):
         flattened_mask = mask.flatten(1)
 
         # Fourth, sent flattened_features + flattened_mask + object_queries through encoder
-        # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size)
-        # flattened_mask is a Tensor of shape (batch_size, heigth*width)
+        # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
+        # flattened_mask is a Tensor of shape (batch_size, height*width)
         if encoder_outputs is None:
             encoder_outputs = self.conditional_detr.model.encoder(
                 inputs_embeds=flattened_features,
@@ -2046,7 +2046,7 @@ class ConditionalDetrMaskHeadSmallConv(nn.Module):
                 nn.init.constant_(m.bias, 0)
 
     def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]):
-        # here we concatenate x, the projected feature map, of shape (batch_size, d_model, heigth/32, width/32) with
+        # here we concatenate x, the projected feature map, of shape (batch_size, d_model, height/32, width/32) with
         # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32).
         # We expand the projected feature map to match the number of heads.
         x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1)
diff --git a/src/transformers/models/convbert/tokenization_convbert.py b/src/transformers/models/convbert/tokenization_convbert.py
index c2d6842838..1593e77ef4 100644
--- a/src/transformers/models/convbert/tokenization_convbert.py
+++ b/src/transformers/models/convbert/tokenization_convbert.py
@@ -465,7 +465,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/cpmant/modeling_cpmant.py b/src/transformers/models/cpmant/modeling_cpmant.py
index df0aebe3cb..a587a11f87 100755
--- a/src/transformers/models/cpmant/modeling_cpmant.py
+++ b/src/transformers/models/cpmant/modeling_cpmant.py
@@ -454,7 +454,7 @@ class CpmAntSegmentPositionEmbedding(nn.Module):
                 )
             if querylen != query_segment.size(1):
                 raise AssertionError(
-                    f"querylen should be equal to query_segment.size(1), but got {querylen} and {query_segment.szie(1)}!"
+                    f"querylen should be equal to query_segment.size(1), but got {querylen} and {query_segment.size(1)}!"
                 )
 
             key_pos = key_pos.view(batch, -1, keylen)
diff --git a/src/transformers/models/dab_detr/modeling_dab_detr.py b/src/transformers/models/dab_detr/modeling_dab_detr.py
index 84d6f276a8..3a60a07edf 100644
--- a/src/transformers/models/dab_detr/modeling_dab_detr.py
+++ b/src/transformers/models/dab_detr/modeling_dab_detr.py
@@ -143,7 +143,7 @@ class DabDetrObjectDetectionOutput(ModelOutput):
             possible padding). You can use [`~DabDetrImageProcessor.post_process_object_detection`] to retrieve the
             unnormalized bounding boxes.
         auxiliary_outputs (`list[Dict]`, *optional*):
-            Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
             and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
             `pred_boxes`) for each decoder layer.
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
@@ -1415,8 +1415,8 @@ class DabDetrModel(DabDetrPreTrainedModel):
         reference_position_embeddings = self.query_refpoint_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1)
 
         # Fourth, sent flattened_features + flattened_mask + object_queries through encoder
-        # flattened_features is a Tensor of shape (heigth*width, batch_size, hidden_size)
-        # flattened_mask is a Tensor of shape (batch_size, heigth*width)
+        # flattened_features is a Tensor of shape (height*width, batch_size, hidden_size)
+        # flattened_mask is a Tensor of shape (batch_size, height*width)
         if encoder_outputs is None:
             encoder_outputs = self.encoder(
                 inputs_embeds=flattened_features,
diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py
index c91b99e56c..bab37cc6c1 100644
--- a/src/transformers/models/data2vec/configuration_data2vec_audio.py
+++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py
@@ -90,7 +90,7 @@ class Data2VecAudioConfig(PretrainedConfig):
             Number of groups of 1D convolutional positional embeddings layer.
         mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
-            procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+            procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
             masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
         mask_time_length (`int`, *optional*, defaults to 10):
@@ -101,7 +101,7 @@ class Data2VecAudioConfig(PretrainedConfig):
             mask_time_min_masks''
         mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
-            masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
+            masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
             the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
             span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
             may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
diff --git a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py
index 813fad89dc..9a41ed6fb0 100644
--- a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py
+++ b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py
@@ -1666,7 +1666,7 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel):
             features[i + 2] = ops[i + 2](features[i + 2])
 
         logits = self.decode_head(features)
-        # Tranpose the logits to maintain consistency in the output formats.
+        # Transpose the logits to maintain consistency in the output formats.
         transposed_logits = tf.transpose(logits, perm=[0, 3, 1, 2])
 
         auxiliary_logits = None
diff --git a/src/transformers/models/decision_transformer/configuration_decision_transformer.py b/src/transformers/models/decision_transformer/configuration_decision_transformer.py
index e677206aa0..436834c7e5 100644
--- a/src/transformers/models/decision_transformer/configuration_decision_transformer.py
+++ b/src/transformers/models/decision_transformer/configuration_decision_transformer.py
@@ -26,7 +26,7 @@ class DecisionTransformerConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`DecisionTransformerModel`]. It is used to
     instantiate a Decision Transformer model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the standard
-    DecisionTransformer architecture. Many of the config options are used to instatiate the GPT2 model that is used as
+    DecisionTransformer architecture. Many of the config options are used to instantiate the GPT2 model that is used as
     part of the architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
diff --git a/src/transformers/models/decision_transformer/modeling_decision_transformer.py b/src/transformers/models/decision_transformer/modeling_decision_transformer.py
index 54000b8f24..22501ee508 100755
--- a/src/transformers/models/decision_transformer/modeling_decision_transformer.py
+++ b/src/transformers/models/decision_transformer/modeling_decision_transformer.py
@@ -319,7 +319,7 @@ class DecisionTransformerGPT2Attention(nn.Module):
             else:
                 # Attention functions are consistent with previous equivalent attention classes, however they do not support some options
                 # (e.g. layer scaling, head mask) that eager supports. These implementations are thus equivalent to previous code, but
-                # not necessarily to eager (if mentionned options are provided).
+                # not necessarily to eager (if mentioned options are provided).
                 attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
 
         if using_eager and self.reorder_and_upcast_attn:
diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py
index 4e177dde1a..a540bdc5d7 100755
--- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py
+++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py
@@ -226,7 +226,7 @@ class DeformableDetrObjectDetectionOutput(ModelOutput):
             possible padding). You can use [`~DeformableDetrProcessor.post_process_object_detection`] to retrieve the
             unnormalized bounding boxes.
         auxiliary_outputs (`list[Dict]`, *optional*):
-            Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
             and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
             `pred_boxes`) for each decoder layer.
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
@@ -1578,8 +1578,8 @@ class DeformableDetrModel(DeformableDetrPreTrainedModel):
 
             scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2)
             grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale
-            width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level)
-            proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4)
+            width_height = torch.ones_like(grid) * 0.05 * (2.0**level)
+            proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4)
             proposals.append(proposal)
             _cur += height * width
         output_proposals = torch.cat(proposals, 1)
diff --git a/src/transformers/models/deprecated/deta/modeling_deta.py b/src/transformers/models/deprecated/deta/modeling_deta.py
index 3e0b359947..ef4f0da573 100644
--- a/src/transformers/models/deprecated/deta/modeling_deta.py
+++ b/src/transformers/models/deprecated/deta/modeling_deta.py
@@ -267,7 +267,7 @@ class DetaObjectDetectionOutput(ModelOutput):
             possible padding). You can use [`~DetaProcessor.post_process_object_detection`] to retrieve the
             unnormalized bounding boxes.
         auxiliary_outputs (`list[Dict]`, *optional*):
-            Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
             and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
             `pred_boxes`) for each decoder layer.
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
@@ -1570,8 +1570,8 @@ class DetaModel(DetaPreTrainedModel):
 
             scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2)
             grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale
-            width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level)
-            proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4)
+            width_height = torch.ones_like(grid) * 0.05 * (2.0**level)
+            proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4)
             proposals.append(proposal)
             _cur += height * width
             level_ids.append(grid.new_ones(height * width, dtype=torch.long) * level)
@@ -2293,7 +2293,7 @@ class DetaLoss(nn.Module):
         else:
             indices = self.matcher(outputs_without_aux, targets)
 
-        # Compute the average number of target boxes accross all nodes, for normalization purposes
+        # Compute the average number of target boxes across all nodes, for normalization purposes
         num_boxes = sum(len(t["class_labels"]) for t in targets)
         num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
         # Check that we have initialized the distributed state
diff --git a/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py
index 8ac9a13f5c..80f16881b5 100644
--- a/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py
@@ -194,9 +194,9 @@ def convert_efficientformer_checkpoint(
     # Save Checkpoints
     Path(pytorch_dump_path).mkdir(exist_ok=True)
     model.save_pretrained(pytorch_dump_path)
-    print(f"Checkpoint successfuly converted. Model saved at {pytorch_dump_path}")
+    print(f"Checkpoint successfully converted. Model saved at {pytorch_dump_path}")
     processor.save_pretrained(pytorch_dump_path)
-    print(f"Processor successfuly saved at {pytorch_dump_path}")
+    print(f"Processor successfully saved at {pytorch_dump_path}")
 
     if push_to_hub:
         print("Pushing model to the hub...")
diff --git a/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py
index a84d000d44..8aa927d821 100644
--- a/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py
+++ b/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py
@@ -47,7 +47,7 @@ def convert_tf_gptsan_to_pt(args):
                     player = int(key_name[9])
                 elif key_name.startswith("pasts/out"):
                     player = 8
-                name = "model.sqout.%d.weight" % (player * 2)  # enter to nn.Sequencial with Tanh, so 2 at a time
+                name = "model.sqout.%d.weight" % (player * 2)  # enter to nn.Sequential with Tanh, so 2 at a time
                 state = vnp.transpose([1, 0]).copy()  # Mesh-Tensorflow is a diagonal matrix
                 new_state[name] = torch.tensor(state)
             elif key_name.startswith("model/moe"):
diff --git a/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py
index a35ea4a311..17da733be9 100644
--- a/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py
+++ b/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py
@@ -258,12 +258,12 @@ class GPTSanJapaneseSparseMLP(nn.Module):
         expert the corresponding hidden states.
 
         """
-        # Step 1: Get the router_mask from the router as wel as the probabilities
+        # Step 1: Get the router_mask from the router as well as the probabilities
         router_mask, router_probs, router_logits = self.router(hidden_states)
         expert_index = torch.argmax(router_mask, dim=-1)
 
         # The routers introduced might not always map all the tokens, to a router, which means that some hidden states
-        # can be unchanged from one layer to another. That is why the hidden states are cloned before updating only the seleced ones.
+        # can be unchanged from one layer to another. That is why the hidden states are cloned before updating only the selected ones.
 
         next_states = hidden_states.clone()
         for idx, expert in enumerate(self.experts.values()):
@@ -905,7 +905,7 @@ class GPTSanJapaneseModel(GPTSanJapanesePreTrainedModel):
 
         Returns:
             `MoEModelOutputWithPastAndCrossAttentions` or `tuple` if `return_dict` returns
-            MoEModelOutputWithPastAndCrossAttentions insted of tuple
+            MoEModelOutputWithPastAndCrossAttentions instead of tuple
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         device = self.position_embeddings.weight.device
@@ -1006,7 +1006,7 @@ class GPTSanJapaneseModel(GPTSanJapanesePreTrainedModel):
         if token_type_ids is not None:
             token_type_ids = token_type_ids.unsqueeze(1).unsqueeze(2)
             prefix_lm_mask = ((prefix_lm_mask + token_type_ids) > 0).float()
-        # Marge prefix_lm_mask and attention_mask
+        # Merge prefix_lm_mask and attention_mask
         extended_attention_mask = prefix_lm_mask * attention_mask.unsqueeze(1).unsqueeze(2)
 
         # Prepare head mask if needed
@@ -1130,7 +1130,7 @@ class GPTSanJapaneseForConditionalGeneration(GPTSanJapanesePreTrainedModel):
             labels in `[0, ..., config.vocab_size]`
 
         Returns:
-            `MoECausalLMOutputWithPast` or `tuple` if `return_dict` returns MoECausalLMOutputWithPast insted of tuple
+            `MoECausalLMOutputWithPast` or `tuple` if `return_dict` returns MoECausalLMOutputWithPast instead of tuple
 
         Example:
 
diff --git a/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py
index c93ea87278..9ffe4d9b14 100644
--- a/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py
+++ b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py
@@ -125,7 +125,7 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
         emoji_file (`str`):
             File containing the emoji.
         unk_token (`str`, *optional*, defaults to `"<|nottoken|>"`):
-            The token used for unknown charactor
+            The token used for unknown character
         pad_token (`str`, *optional*, defaults to `"<|separator|>"`):
             The token used for padding
         bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
@@ -372,7 +372,7 @@ class SubWordJapaneseTokenizer:
     - Decoding byte0~byte255 tokens correctly
     - Added bagofword token handling
 
-    https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT Lisence according to the
+    https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT License according to the
     original repository.
 
     MIT License
diff --git a/src/transformers/models/deprecated/graphormer/modeling_graphormer.py b/src/transformers/models/deprecated/graphormer/modeling_graphormer.py
index 1253d1365e..7b91a429b0 100755
--- a/src/transformers/models/deprecated/graphormer/modeling_graphormer.py
+++ b/src/transformers/models/deprecated/graphormer/modeling_graphormer.py
@@ -740,7 +740,7 @@ class GraphormerPreTrainedModel(PreTrainedModel):
         Initialize the weights
         """
         if isinstance(module, (nn.Linear, nn.Conv2d)):
-            # We might be missing part of the Linear init, dependant on the layer num
+            # We might be missing part of the Linear init, dependent on the layer num
             module.weight.data.normal_(mean=0.0, std=0.02)
             if module.bias is not None:
                 module.bias.data.zero_()
diff --git a/src/transformers/models/deprecated/jukebox/convert_jukebox.py b/src/transformers/models/deprecated/jukebox/convert_jukebox.py
index aac3b2efe7..3380e38693 100644
--- a/src/transformers/models/deprecated/jukebox/convert_jukebox.py
+++ b/src/transformers/models/deprecated/jukebox/convert_jukebox.py
@@ -197,7 +197,7 @@ def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping):
         if f"{key_prefix}.{key}" not in model_state_dict or key is None:
             print(f"failed converting {original_key} to {key}, does not match")
 
-        # handle missmatched shape
+        # handle mismatched shape
         elif value.shape != model_state_dict[f"{key_prefix}.{key}"].shape:
             val = model_state_dict[f"{key_prefix}.{key}"]
             print(f"{original_key}-> {key} : \nshape {val.shape} and {value.shape}, do not match")
diff --git a/src/transformers/models/deprecated/jukebox/modeling_jukebox.py b/src/transformers/models/deprecated/jukebox/modeling_jukebox.py
index 566148ceda..3bff1d83ba 100755
--- a/src/transformers/models/deprecated/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/deprecated/jukebox/modeling_jukebox.py
@@ -1308,11 +1308,11 @@ class JukeboxConditionalAutoregressive(nn.Module):
                 Number of tokens or lyrics tokens provided in a single pass.
             embed_dim (`int`, *optional*):
                 Either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codebook dimension,
-                if the model combines lyrics and music tokens, or simply n_vocab if the model is a seperate encoder
+                if the model combines lyrics and music tokens, or simply n_vocab if the model is a separate encoder
             audio_conditioning (`bool`, *optional*, defaults to `False`):
-                Whether or not the prior supports conditionning on audio.
+                Whether or not the prior supports conditioning on audio.
             metadata_conditioning (`bool`, *optional*, defaults to `False`):
-                Whether or not the prior supports conditionning on artitst, genres, lyrics and timing.
+                Whether or not the prior supports conditioning on artist, genres, lyrics and timing.
             is_encoder (`bool`, *optional*, defaults to `False`):
                 Whether the model is an encoder only model.
         """
@@ -1392,7 +1392,7 @@ class JukeboxConditionalAutoregressive(nn.Module):
         hidden_states = self.transformer(
             hidden_states, last_encoder_hidden_states=last_encoder_hidden_states
         )  # Transformer
-        if self.add_cond_after_transformer:  # Piped doesnt add x_cond
+        if self.add_cond_after_transformer:  # Piped doesn't add x_cond
             hidden_states = hidden_states + audio_conditioning
 
         activations = hidden_states
@@ -1535,7 +1535,7 @@ class JukeboxConditionalAutoregressive(nn.Module):
             if get_preds:
                 preds = []
 
-            # Fill up key/value cache for past context by runing forward pass.
+            # Fill up key/value cache for past context by running forward pass.
             # We do so in chunks instead of doing the whole past in one forward pass to reduce max memory usage.
             if chunk_size is None:
                 chunk_size = len(sampled_audio)
@@ -1617,7 +1617,7 @@ class JukeboxConditionalAutoregressive(nn.Module):
 
 class JukeboxMusicTokenConditioner(nn.Module):
     """
-    The `JukeboxMusicTokenConditioner` takes music tokens as an input (coresponding to the codes of the VQVAE's
+    The `JukeboxMusicTokenConditioner` takes music tokens as an input (corresponding to the codes of the VQVAE's
     codebook) and upsamples it using a single layer of decoder convolution block (the same is used in the VQVAE).
     """
 
@@ -1637,20 +1637,20 @@ class JukeboxMusicTokenConditioner(nn.Module):
         )
         self.layer_norm = JukeboxLayerNorm(config.hidden_size)
 
-    def forward(self, music_tokens, raw_audio_conditionning=None):
+    def forward(self, music_tokens, raw_audio_conditioning=None):
         """
         Args:
             music_tokens (`torch.LongTensor`):
-                Music tokens form the uper level in range(nb_discrete_codes)
-            raw_audio_conditionning (`torch.LongTensor`, *optional*):
+                Music tokens from the upper level in range(nb_discrete_codes)
+            raw_audio_conditioning (`torch.LongTensor`, *optional*):
                 Audio used when primed sampling, raw audio information that conditions the generation
         """
-        if raw_audio_conditionning is None:
-            raw_audio_conditionning = 0.0
+        if raw_audio_conditioning is None:
+            raw_audio_conditioning = 0.0
         # Embed music_tokens
         music_tokens = music_tokens.long()
         hidden_states = self.embed_tokens(music_tokens)
-        hidden_states = hidden_states + raw_audio_conditionning
+        hidden_states = hidden_states + raw_audio_conditioning
 
         # Run conditioner
         hidden_states = hidden_states.permute(0, 2, 1)
@@ -1768,7 +1768,7 @@ class JukeboxPrior(PreTrainedModel):
     """
     The JukeboxPrior class, which is a wrapper around the various conditioning and the transformer. JukeboxPrior can be
     seen as language models trained on music. They model the next `music token` prediction task. If a (lyric) `encoderù
-    is defined, it also models the `next character` prediction on the lyrics. Can be conditionned on timing, artist,
+    is defined, it also models the `next character` prediction on the lyrics. Can be conditioned on timing, artist,
     genre, lyrics and codes from lower-levels Priors.
 
     Args:
@@ -1809,7 +1809,7 @@ class JukeboxPrior(PreTrainedModel):
         elif isinstance(module, JukeboxConditionalAutoregressive) and hasattr(module, "start_token"):
             module.start_token.data.normal_(mean=0.0, std=0.01 * init_scale)
         elif isinstance(module, JukeboxResConv1DBlock) and self.config.zero_out:
-            module.conv1d_2.weigth.data.zero_()
+            module.conv1d_2.weight.data.zero_()
             module.conv1d_2.bias.data.zero_()
         if isinstance(module, nn.LayerNorm):
             module.bias.data.zero_()
@@ -1931,7 +1931,7 @@ class JukeboxPrior(PreTrainedModel):
             tokens_list = torch.zeros(
                 (labels.shape[0], self.nb_relevant_lyric_tokens), dtype=torch.long, device=labels.device
             )
-            indices_list = []  # whats the index of each current character in original array
+            indices_list = []  # what's the index of each current character in original array
             for idx in range(labels.shape[0]):
                 full_tokens = labels.clone()[:, 4 + self.metadata_embedding.max_nb_genres :]
                 total_length, offset, duration = labels[idx, 0], labels[idx, 1], labels[idx, 2]
@@ -2073,12 +2073,12 @@ class JukeboxPrior(PreTrainedModel):
             n_samples (`int`):
                 Number of samples to generate.
             music_tokens (`List[torch.LongTensor]`, *optional*):
-                Previously gemerated tokens at the current level. Used as context for the generation.
+                Previously generated tokens at the current level. Used as context for the generation.
             music_tokens_conds (`List[torch.FloatTensor]`, *optional*):
                 Upper-level music tokens generated by the previous prior model. Is `None` if the generation is not
-                conditionned on the upper-level tokens.
+                conditioned on the upper-level tokens.
             metadata (`List[torch.LongTensor]`, *optional*):
-                List containing the metatdata tensor with the artist, genre and the lyric tokens.
+                List containing the metadata tensor with the artist, genre and the lyric tokens.
             temp (`float`, *optional*, defaults to 1.0):
                 Sampling temperature.
             top_k (`int`, *optional*, defaults to 0):
@@ -2237,11 +2237,11 @@ class JukeboxPrior(PreTrainedModel):
             hidden_states (`torch.Tensor`):
                 Hidden states which should be raw audio
             metadata (`List[torch.LongTensor]`, *optional*):
-                List containing the metadata conditioning tensorwith the lyric and the metadata tokens.
+                List containing the metadata conditioning tensor with the lyric and the metadata tokens.
             decode (`bool`, *optional*, defaults to `False`):
                 Whether or not to decode the encoded to tokens.
             get_preds (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the actual predicitons of the model.
+                Whether or not to return the actual predictions of the model.
         """
         batch_size = hidden_states.shape[0]
         music_tokens, *music_tokens_conds = self.encode(hidden_states, bs_chunks=batch_size)
@@ -2466,10 +2466,10 @@ class JukeboxModel(JukeboxPreTrainedModel):
             metas (`List[Any]`, *optional*):
                 Metadatas used to generate the `labels`
             chunk_size (`int`, *optional*, defaults to 32):
-                Size of a chunk of audio, used to fill up the memory in chuncks to prevent OOM erros. Bigger chunks
+                Size of a chunk of audio, used to fill up the memory in chunks to prevent OOM errors. Bigger chunks
                 means faster memory filling but more consumption.
             sampling_temperature (`float`, *optional*, defaults to 0.98):
-                Temperature used to ajust the randomness of the sampling.
+                Temperature used to adjust the randomness of the sampling.
             lower_batch_size (`int`, *optional*, defaults to 16):
                 Maximum batch size for the lower level priors
             max_batch_size (`int`, *optional*, defaults to 16):
diff --git a/src/transformers/models/deprecated/mctct/modeling_mctct.py b/src/transformers/models/deprecated/mctct/modeling_mctct.py
index 2dd074b28c..e873111cb2 100755
--- a/src/transformers/models/deprecated/mctct/modeling_mctct.py
+++ b/src/transformers/models/deprecated/mctct/modeling_mctct.py
@@ -467,7 +467,7 @@ class MCTCTPreTrainedModel(PreTrainedModel):
 
     def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
         # generate creates 3D attention mask, because of the shape of input_features
-        # convert it to 2D if thats the case
+        # convert it to 2D if that's the case
         if len(attention_mask.shape) > 2:
             attention_mask = attention_mask[:, :, -1]
 
diff --git a/src/transformers/models/deprecated/mega/modeling_mega.py b/src/transformers/models/deprecated/mega/modeling_mega.py
index d5a490b01d..85d1015610 100644
--- a/src/transformers/models/deprecated/mega/modeling_mega.py
+++ b/src/transformers/models/deprecated/mega/modeling_mega.py
@@ -463,7 +463,7 @@ class MegaMultiDimensionDampedEma(nn.Module):
             prev_state (`torch.Tensor` of shape `(batch_size, config.ndim)`, *optional*):
                 The hidden state returned from the previous timestep during incremental decoding.
             use_cache (`bool`, default `False`):
-                Whether to perfom incremental decoding; uses `prev_state` as the prior timestep, and returns the
+                Whether to perform incremental decoding; uses `prev_state` as the prior timestep, and returns the
                 updated EMA hidden state for use in the next step
 
         Returns:
@@ -652,7 +652,7 @@ class MegaGatedCrossAttention(nn.Module):
             output_attentions (`bool`, defaults to `False`):
                 Whether or not to return the cross-attention weights.
             use_cache (`bool`, defaults to `False`):
-                Whether to perfom incremental decoding; uses `prev_state` as the prior timestep, and returns the
+                Whether to perform incremental decoding; uses `prev_state` as the prior timestep, and returns the
                 updated EMA hidden state for use in the next step
 
         Returns:
@@ -936,7 +936,7 @@ class MegaMovingAverageGatedAttention(nn.Module):
             output_attentions (`bool`, default `False`):
                 Whether to return self-attention weights
             use_cache (`bool`, default `False`):
-                Whether to perfom incremental decoding; uses `past_key_values` as prior state, and returns the updated
+                Whether to perform incremental decoding; uses `past_key_values` as prior state, and returns the updated
                 states for use in the next step
 
         Returns:
@@ -1214,7 +1214,7 @@ class MegaBlock(nn.Module):
             output_attentions (`bool`, default `False`):
                 Whether to return self-attention weights
             use_cache (`bool`, default `False`):
-                Whether to perfom incremental decoding; uses `past_key_value` as prior state, and returns the updated
+                Whether to perform incremental decoding; uses `past_key_value` as prior state, and returns the updated
                 states for use in the next step
 
         Returns:
diff --git a/src/transformers/models/deprecated/realm/retrieval_realm.py b/src/transformers/models/deprecated/realm/retrieval_realm.py
index b3c084f1d2..b5e47abb11 100644
--- a/src/transformers/models/deprecated/realm/retrieval_realm.py
+++ b/src/transformers/models/deprecated/realm/retrieval_realm.py
@@ -76,7 +76,7 @@ class RealmRetriever:
 
         Parameters:
             block_records (`np.ndarray`):
-                A numpy array which cantains evidence texts.
+                A numpy array which contains evidence texts.
             tokenizer ([`RealmTokenizer`]):
                 The tokenizer to encode retrieved texts.
     """
diff --git a/src/transformers/models/deprecated/realm/tokenization_realm.py b/src/transformers/models/deprecated/realm/tokenization_realm.py
index 70e69bc4bc..5c3c7a196f 100644
--- a/src/transformers/models/deprecated/realm/tokenization_realm.py
+++ b/src/transformers/models/deprecated/realm/tokenization_realm.py
@@ -516,7 +516,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/deprecated/retribert/tokenization_retribert.py b/src/transformers/models/deprecated/retribert/tokenization_retribert.py
index 35a1874aa0..7f2206f0d9 100644
--- a/src/transformers/models/deprecated/retribert/tokenization_retribert.py
+++ b/src/transformers/models/deprecated/retribert/tokenization_retribert.py
@@ -457,7 +457,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py
index 23972deae2..7d82659b5a 100644
--- a/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py
+++ b/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py
@@ -49,7 +49,7 @@ class TransfoXLConfig(PretrainedConfig):
         d_inner (`int`, *optional*, defaults to 4096):
             Inner dimension in FF
         div_val (`int`, *optional*, defaults to 4):
-            Divident value for adapative input and softmax
+            Dividend value for adaptive input and softmax
         pre_lnorm (`boolean`, *optional*, defaults to `False`):
             Whether or not to apply LayerNorm to the input instead of the output in the blocks.
         n_layer (`int`, *optional*, defaults to 18):
diff --git a/src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py b/src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py
index bbbfac9031..6e1c49d708 100644
--- a/src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py
+++ b/src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py
@@ -46,7 +46,7 @@ class TvltFeatureExtractor(SequenceFeatureExtractor):
         sampling_rate (`int`, *optional*, defaults to 44100):
             The sampling rate at which the audio files should be digitalized expressed in Hertz (Hz).
         hop_length_to_sampling_rate (`int`, *optional*, defaults to 86):
-            Hop length is length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
+            Hop length is length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
             For example, with sampling rate 44100, the hop length is 512, with 44100 / 512 = 86
         n_fft (`int`, *optional*, defaults to 2048):
             Size of the Fourier transform.
@@ -141,7 +141,7 @@ class TvltFeatureExtractor(SequenceFeatureExtractor):
 
                 <Tip>
 
-                For TvltTransformer models, `attention_mask` should alwys be passed for batched inference, to avoid
+                For TvltTransformer models, `attention_mask` should always be passed for batched inference, to avoid
                 subtle bugs.
 
                 </Tip>
diff --git a/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py b/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py
index 624e52cedd..02d78c9340 100644
--- a/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py
+++ b/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py
@@ -356,10 +356,10 @@ class TvltImageProcessor(BaseImageProcessor):
 
             - **pixel_mask** -- Pixel masks to be fed to a model, of shape (batch_size, num_pixel_patches).
 
-            - **pixel_values_mixed** -- Pixel values with both postive or negative to be fed to a model, of shape
+            - **pixel_values_mixed** -- Pixel values with both positive or negative to be fed to a model, of shape
               (batch_size, num_channels, height, width).
 
-            - **pixel_mask_mixed** -- Pixel masks with both postive or negative to be fed to a model, of shape
+            - **pixel_mask_mixed** -- Pixel masks with both positive or negative to be fed to a model, of shape
               (batch_size, num_pixel_patches).
         """
         do_resize = do_resize if do_resize is not None else self.do_resize
diff --git a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py
index cd87217f05..b509d60d12 100644
--- a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py
+++ b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py
@@ -91,7 +91,7 @@ class ModuleTransfer:
         for dest_m, src_m in zip(dest_traced, src_traced):
             dest_m.load_state_dict(src_m.state_dict())
             if self.verbose == 1:
-                print(f"Transfered from={src_m} to={dest_m}")
+                print(f"Transferred from={src_m} to={dest_m}")
 
 
 def copy_parameters(from_model: nn.Module, our_model: nn.Module) -> nn.Module:
diff --git a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py
index d43ff7f40d..f07a76b2b2 100644
--- a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py
+++ b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py
@@ -87,7 +87,7 @@ def create_rename_keys(config):
     rename_keys.append(("pretrained.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight"))
     rename_keys.append(("pretrained.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias"))
 
-    # Transfomer encoder
+    # Transformer encoder
     for i in range(config.backbone_config.num_hidden_layers):
         rename_keys.append((f"pretrained.blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1"))
         rename_keys.append((f"pretrained.blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1"))
diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py
index c26bf484f5..d1aa64d2f6 100644
--- a/src/transformers/models/depth_pro/modeling_depth_pro.py
+++ b/src/transformers/models/depth_pro/modeling_depth_pro.py
@@ -902,14 +902,14 @@ class DepthProFeatureFusionStage(nn.Module):
         for _ in range(self.num_layers - 1):
             self.intermediate.append(DepthProFeatureFusionLayer(config))
 
-        # final layer doesnot require deconvolution
+        # final layer does not require deconvolution
         self.final = DepthProFeatureFusionLayer(config, use_deconv=False)
 
     def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]:
         if self.num_layers != len(hidden_states):
             raise ValueError(
                 f"num_layers={self.num_layers} in DepthProFeatureFusionStage"
-                f"doesnot match len(hidden_states)={len(hidden_states)}"
+                f" does not match len(hidden_states)={len(hidden_states)}"
             )
 
         fused_hidden_states = []
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index cb47f58bda..989d68e75d 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -138,7 +138,7 @@ class DetrObjectDetectionOutput(ModelOutput):
             possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
             unnormalized bounding boxes.
         auxiliary_outputs (`list[Dict]`, *optional*):
-            Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
             and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
             `pred_boxes`) for each decoder layer.
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
@@ -1278,8 +1278,8 @@ class DetrModel(DetrPreTrainedModel):
         flattened_mask = mask.flatten(1)
 
         # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder
-        # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size)
-        # flattened_mask is a Tensor of shape (batch_size, heigth*width)
+        # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
+        # flattened_mask is a Tensor of shape (batch_size, height*width)
         if encoder_outputs is None:
             encoder_outputs = self.encoder(
                 inputs_embeds=flattened_features,
@@ -1603,8 +1603,8 @@ class DetrForSegmentation(DetrPreTrainedModel):
         flattened_mask = mask.flatten(1)
 
         # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder
-        # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size)
-        # flattened_mask is a Tensor of shape (batch_size, heigth*width)
+        # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
+        # flattened_mask is a Tensor of shape (batch_size, height*width)
         if encoder_outputs is None:
             encoder_outputs = self.detr.model.encoder(
                 inputs_embeds=flattened_features,
@@ -1739,7 +1739,7 @@ class DetrMaskHeadSmallConv(nn.Module):
                 nn.init.constant_(m.bias, 0)
 
     def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]):
-        # here we concatenate x, the projected feature map, of shape (batch_size, d_model, heigth/32, width/32) with
+        # here we concatenate x, the projected feature map, of shape (batch_size, d_model, height/32, width/32) with
         # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32).
         # We expand the projected feature map to match the number of heads.
         x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1)
diff --git a/src/transformers/models/distilbert/tokenization_distilbert.py b/src/transformers/models/distilbert/tokenization_distilbert.py
index c894211a2e..e5f1a20ae5 100644
--- a/src/transformers/models/distilbert/tokenization_distilbert.py
+++ b/src/transformers/models/distilbert/tokenization_distilbert.py
@@ -475,7 +475,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py
index 367aff7f90..21aa2b4897 100644
--- a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py
+++ b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py
@@ -123,7 +123,7 @@ def create_rename_keys_backbone(config):
     rename_keys.append(("patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight"))
     rename_keys.append(("patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias"))
 
-    # Transfomer encoder
+    # Transformer encoder
     for i in range(config.backbone_config.num_hidden_layers):
         # layernorms
         rename_keys.append((f"blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight"))
diff --git a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py b/src/transformers/models/dpt/convert_dpt_beit_to_hf.py
index 3a576d772f..c4ff8a3eb7 100644
--- a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py
+++ b/src/transformers/models/dpt/convert_dpt_beit_to_hf.py
@@ -77,7 +77,7 @@ def create_rename_keys(config):
     rename_keys.append(("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight"))
     rename_keys.append(("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias"))
 
-    # Transfomer encoder
+    # Transformer encoder
     for i in range(config.backbone_config.num_hidden_layers):
         rename_keys.append((f"pretrained.model.blocks.{i}.gamma_1", f"backbone.encoder.layer.{i}.lambda_1"))
         rename_keys.append((f"pretrained.model.blocks.{i}.gamma_2", f"backbone.encoder.layer.{i}.lambda_2"))
diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py
index 095cd1a48b..9a35ee4b4a 100644
--- a/src/transformers/models/dpt/image_processing_dpt.py
+++ b/src/transformers/models/dpt/image_processing_dpt.py
@@ -111,22 +111,22 @@ class DPTImageProcessor(BaseImageProcessor):
 
     Args:
         do_resize (`bool`, *optional*, defaults to `True`):
-            Whether to resize the image's (height, width) dimensions. Can be overidden by `do_resize` in `preprocess`.
+            Whether to resize the image's (height, width) dimensions. Can be overridden by `do_resize` in `preprocess`.
         size (`Dict[str, int]` *optional*, defaults to `{"height": 384, "width": 384}`):
-            Size of the image after resizing. Can be overidden by `size` in `preprocess`.
+            Size of the image after resizing. Can be overridden by `size` in `preprocess`.
         resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
-            Defines the resampling filter to use if resizing the image. Can be overidden by `resample` in `preprocess`.
+            Defines the resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
         keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
             If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
-            be overidden by `keep_aspect_ratio` in `preprocess`.
+            be overridden by `keep_aspect_ratio` in `preprocess`.
         ensure_multiple_of (`int`, *optional*, defaults to 1):
-            If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overidden
+            If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
             by `ensure_multiple_of` in `preprocess`.
         do_rescale (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the image by the specified scale `rescale_factor`. Can be overidden by `do_rescale` in
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
             `preprocess`.
         rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
-            Scale factor to use if rescaling the image. Can be overidden by `rescale_factor` in `preprocess`.
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in `preprocess`.
         do_normalize (`bool`, *optional*, defaults to `True`):
             Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
             method.
diff --git a/src/transformers/models/efficientnet/modeling_efficientnet.py b/src/transformers/models/efficientnet/modeling_efficientnet.py
index 9e0b890729..7943b5d117 100644
--- a/src/transformers/models/efficientnet/modeling_efficientnet.py
+++ b/src/transformers/models/efficientnet/modeling_efficientnet.py
@@ -383,7 +383,7 @@ class EfficientNetBlock(nn.Module):
 
 class EfficientNetEncoder(nn.Module):
     r"""
-    Forward propogates the embeddings through each EfficientNet block.
+    Forward propagates the embeddings through each EfficientNet block.
 
     Args:
         config ([`EfficientNetConfig`]):
diff --git a/src/transformers/models/electra/modeling_flax_electra.py b/src/transformers/models/electra/modeling_flax_electra.py
index 77a445e6cc..4bf75ff33e 100644
--- a/src/transformers/models/electra/modeling_flax_electra.py
+++ b/src/transformers/models/electra/modeling_flax_electra.py
@@ -1237,7 +1237,7 @@ class FlaxElectraSequenceSummary(nn.Module):
         Returns:
             `jnp.ndarray`: The summary of the sequence hidden states.
         """
-        # NOTE: this doest "first" type summary always
+        # NOTE: this does "first" type summary always
         output = hidden_states[:, 0]
         output = self.first_dropout(output, deterministic=deterministic)
         output = self.summary(output)
diff --git a/src/transformers/models/electra/tokenization_electra.py b/src/transformers/models/electra/tokenization_electra.py
index 3b21527e6c..365274d7ed 100644
--- a/src/transformers/models/electra/tokenization_electra.py
+++ b/src/transformers/models/electra/tokenization_electra.py
@@ -464,7 +464,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py
index f5a626a21f..d2d0b56a8d 100644
--- a/src/transformers/models/emu3/modeling_emu3.py
+++ b/src/transformers/models/emu3/modeling_emu3.py
@@ -1780,7 +1780,7 @@ EMU3_INPUTS_DOCSTRING = r"""
 
 class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["text_model.lm_head.weight"]
-    _supports_static_cache = False  # `get_image_tokens()`, called when `pixel_values` is passed, is not compileable
+    _supports_static_cache = False  # `get_image_tokens()`, called when `pixel_values` is passed, is not compilable
 
     def __init__(self, config):
         super().__init__(config)
diff --git a/src/transformers/models/emu3/modular_emu3.py b/src/transformers/models/emu3/modular_emu3.py
index c4e35e71d2..52d32dbdee 100644
--- a/src/transformers/models/emu3/modular_emu3.py
+++ b/src/transformers/models/emu3/modular_emu3.py
@@ -1123,7 +1123,7 @@ class Emu3ForCausalLM(LlamaForCausalLM, Emu3PreTrainedModel, GenerationMixin):
 
 class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["text_model.lm_head.weight"]
-    _supports_static_cache = False  # `get_image_tokens()`, called when `pixel_values` is passed, is not compileable
+    _supports_static_cache = False  # `get_image_tokens()`, called when `pixel_values` is passed, is not compilable
 
     def __init__(self, config):
         super().__init__(config)
diff --git a/src/transformers/models/encodec/configuration_encodec.py b/src/transformers/models/encodec/configuration_encodec.py
index 77fd67727d..c00b5ebd1c 100644
--- a/src/transformers/models/encodec/configuration_encodec.py
+++ b/src/transformers/models/encodec/configuration_encodec.py
@@ -38,7 +38,7 @@ class EncodecConfig(PretrainedConfig):
 
     Args:
         target_bandwidths (`List[float]`, *optional*, defaults to `[1.5, 3.0, 6.0, 12.0, 24.0]`):
-            The range of diffent bandwiths the model can encode audio with.
+            The range of different bandwidths the model can encode audio with.
         sampling_rate (`int`, *optional*, defaults to 24000):
             The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
         audio_channels (`int`, *optional*, defaults to 1):
diff --git a/src/transformers/models/encodec/modeling_encodec.py b/src/transformers/models/encodec/modeling_encodec.py
index 670ac99e03..ba699d745e 100644
--- a/src/transformers/models/encodec/modeling_encodec.py
+++ b/src/transformers/models/encodec/modeling_encodec.py
@@ -503,7 +503,7 @@ ENCODEC_START_DOCSTRING = r"""
 ENCODEC_INPUTS_DOCSTRING = r"""
     Args:
         input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
-            Raw audio input converted to Float and padded to the approriate length in order to be encoded using chunks
+            Raw audio input converted to Float and padded to the appropriate length in order to be encoded using chunks
             of length self.chunk_length and a stride of `config.chunk_stride`.
         padding_mask (`torch.BoolTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
             Mask to avoid computing scaling factors on padding token indices (can we avoid computing conv on these+).
diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
index a5eff83e55..af57b2596c 100644
--- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
@@ -78,7 +78,7 @@ class EncoderDecoderConfig(PretrainedConfig):
         super().__init__(**kwargs)
         if "encoder" not in kwargs or "decoder" not in kwargs:
             raise ValueError(
-                f"A configuraton of type {self.model_type} cannot be instantiated because "
+                f"A configuration of type {self.model_type} cannot be instantiated because "
                 f"both `encoder` and `decoder` sub-configurations were not passed, only {kwargs}"
             )
         encoder_config = kwargs.pop("encoder")
diff --git a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
index bdc589484c..ccb0aa0a6d 100644
--- a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
@@ -784,7 +784,7 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
                       [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
 
             model_args (remaining positional arguments, *optional*):
-                All remaning positional arguments will be passed to the underlying model's `__init__` method.
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
 
             kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
index a5abafc361..9926f8d10f 100644
--- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
@@ -341,7 +341,7 @@ class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss):
                       `decoder_from_pt` should be set to `True`.
 
             model_args (remaining positional arguments, *optional*):
-                All remaning positional arguments will be passed to the underlying model's `__init__` method.
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
 
             kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py
index 74076eddf2..8526771d79 100644
--- a/src/transformers/models/flava/modeling_flava.py
+++ b/src/transformers/models/flava/modeling_flava.py
@@ -604,7 +604,7 @@ class FlavaLayer(nn.Module):
         self.intermediate = FlavaIntermediate(config)
         self.output = FlavaOutput(config)
 
-        # TODO: Check fp32 layer norm possiblity
+        # TODO: Check fp32 layer norm possibility
         self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
diff --git a/src/transformers/models/funnel/tokenization_funnel.py b/src/transformers/models/funnel/tokenization_funnel.py
index 8cb6f1af0e..d1c624a88d 100644
--- a/src/transformers/models/funnel/tokenization_funnel.py
+++ b/src/transformers/models/funnel/tokenization_funnel.py
@@ -495,7 +495,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index 78a40f6d53..314fba427a 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -329,7 +329,7 @@ class GPT2Attention(nn.Module):
             else:
                 # Attention functions are consistent with previous equivalent attention classes, however they do not support some options
                 # (e.g. layer scaling, head mask) that eager supports. These implementations are thus equivalent to previous code, but
-                # not necessarily to eager (if mentionned options are provided).
+                # not necessarily to eager (if mentioned options are provided).
                 attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
 
         if using_eager and self.reorder_and_upcast_attn:
diff --git a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
index 19b0fd2375..194fff7dd6 100644
--- a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
@@ -195,7 +195,7 @@ class GPTNeoXJapaneseTokenizer(PreTrainedTokenizer):
 
 class SubWordJapaneseTokenizer:
     """
-    https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT Lisence according to the
+    https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT License according to the
     original repository.
 
     MIT License
diff --git a/src/transformers/models/granitemoe/configuration_granitemoe.py b/src/transformers/models/granitemoe/configuration_granitemoe.py
index 9ef029f95c..79f3d72cc8 100644
--- a/src/transformers/models/granitemoe/configuration_granitemoe.py
+++ b/src/transformers/models/granitemoe/configuration_granitemoe.py
@@ -97,9 +97,9 @@ class GraniteMoeConfig(PretrainedConfig):
         num_local_experts (`int`, *optional*, defaults to 8): total number of experts
         num_experts_per_tok (`int`, *optional*, defaults to 2): number of experts per token
         output_router_logits (`bool`, *optional*, defaults to `False`):
-            Whether or not the router logits should be returned by the model. Enabeling this will also
+            Whether or not the router logits should be returned by the model. Enabling this will also
             allow the model to output the auxiliary loss.
-        router_aux_loss_coef (`float`, *optional*, defaults to 0.001): router auxialiary loss coefficient
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001): router auxiliary loss coefficient
 
     ```python
     >>> from transformers import GraniteMoeModel, GraniteMoeConfig
diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py
index d417535db7..26496f7d0e 100644
--- a/src/transformers/models/granitemoe/modeling_granitemoe.py
+++ b/src/transformers/models/granitemoe/modeling_granitemoe.py
@@ -237,11 +237,12 @@ class GraniteMoeParallelExperts(nn.Module):
     def __init__(self, num_experts: int, input_size: int, output_size: int) -> None:
         """
         Initialize the GraniteMoeParallelExperts module.
-        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's comptible with
+        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
         many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
         [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
         [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
         used in vllm.
+
         Args:
             num_experts (int):
                 Number of experts.
@@ -259,11 +260,13 @@ class GraniteMoeParallelExperts(nn.Module):
     def forward(self, inputs, expert_size):
         """
         Forward pass of the GraniteMoeParallelExperts module.
+
         Args:
             inputs (Tensor):
                 Input tensor.
             expert_size:
                 Expert size information.
+
         Returns:
             Tensor: Output tensor.
         """
diff --git a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py
index 49df8e0bdf..32b55c69f3 100644
--- a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py
+++ b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py
@@ -97,9 +97,9 @@ class GraniteMoeSharedConfig(PretrainedConfig):
         num_local_experts (`int`, *optional*, defaults to 8): total number of experts
         num_experts_per_tok (`int`, *optional*, defaults to 2): number of experts per token
         output_router_logits (`bool`, *optional*, defaults to `False`):
-            Whether or not the router logits should be returned by the model. Enabeling this will also
+            Whether or not the router logits should be returned by the model. Enabling this will also
             allow the model to output the auxiliary loss.
-        router_aux_loss_coef (`float`, *optional*, defaults to 0.001): router auxialiary loss coefficient
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001): router auxiliary loss coefficient
         shared_intermediate_size (`int`, *optional*, defaults to 0): intermediate size for shared experts. 0 implies
             no shared experts.
 
diff --git a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py
index 59d679fb3c..2ca60e007b 100644
--- a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py
+++ b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py
@@ -105,11 +105,12 @@ class GraniteMoeSharedParallelExperts(nn.Module):
     def __init__(self, num_experts: int, input_size: int, output_size: int) -> None:
         """
         Initialize the GraniteMoeSharedParallelExperts module.
-        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's comptible with
+        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
         many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
         [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
         [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
         used in vllm.
+
         Args:
             num_experts (int):
                 Number of experts.
@@ -127,11 +128,13 @@ class GraniteMoeSharedParallelExperts(nn.Module):
     def forward(self, inputs, expert_size):
         """
         Forward pass of the GraniteMoeSharedParallelExperts module.
+
         Args:
             inputs (Tensor):
                 Input tensor.
             expert_size:
                 Expert size information.
+
         Returns:
             Tensor: Output tensor.
         """
diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py
index a238c1dc1d..6d2a2a8df2 100644
--- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py
+++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py
@@ -262,7 +262,7 @@ class GroundingDinoObjectDetectionOutput(ModelOutput):
             possible padding). You can use [`~GroundingDinoProcessor.post_process_grounded_object_detection`] to retrieve the
             unnormalized bounding boxes.
         auxiliary_outputs (`List[Dict]`, *optional*):
-            Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
             and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
             `pred_boxes`) for each decoder layer.
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
@@ -2098,9 +2098,9 @@ class GroundingDinoModel(GroundingDinoPreTrainedModel):
         _, height, width = mask.shape
         valid_height = torch.sum(mask[:, :, 0], 1)
         valid_width = torch.sum(mask[:, 0, :], 1)
-        valid_ratio_heigth = valid_height.float() / height
+        valid_ratio_height = valid_height.float() / height
         valid_ratio_width = valid_width.float() / width
-        valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1)
+        valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
         return valid_ratio
 
     def generate_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
@@ -2136,8 +2136,8 @@ class GroundingDinoModel(GroundingDinoPreTrainedModel):
 
             scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2)
             grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale
-            width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level)
-            proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4)
+            width_height = torch.ones_like(grid) * 0.05 * (2.0**level)
+            proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4)
             proposals.append(proposal)
             current_position += height * width
 
diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py
index 2d0d2af79e..36d41bfc57 100644
--- a/src/transformers/models/hubert/configuration_hubert.py
+++ b/src/transformers/models/hubert/configuration_hubert.py
@@ -106,7 +106,7 @@ class HubertConfig(PretrainedConfig):
             Recognition](https://arxiv.org/abs/1904.08779).
         mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
-            procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+            procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
             masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
             actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
@@ -118,7 +118,7 @@ class HubertConfig(PretrainedConfig):
             mask_time_min_masks''
         mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
-            masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
+            masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
             the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
             span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
             may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py
index 9b7d260300..c243ebde9e 100644
--- a/src/transformers/models/idefics/modeling_idefics.py
+++ b/src/transformers/models/idefics/modeling_idefics.py
@@ -199,7 +199,7 @@ def freeze_model(model, module_exceptions=[]):
     module_exceptions_mapped = [mapping[m] for m in module_exceptions]
     for module in model.modules():
         if module_exceptions and any(isinstance(module, t) for t in module_exceptions_mapped):
-            module.requires_grad_(True)  # Explicitely setting it to true to avoid any mistakes
+            module.requires_grad_(True)  # Explicitly setting it to true to avoid any mistakes
         else:
             module.requires_grad_(False)
     return model
@@ -1235,7 +1235,7 @@ class IdeficsModel(IdeficsPreTrainedModel):
             image_attention_mask = None
 
         # cross_attention_gate:
-        # For any tokens attending to no images, the hidden_states comming out of the cross-attention should be zeroed-out.
+        # For any tokens attending to no images, the hidden_states coming out of the cross-attention should be zeroed-out.
         # `image_attention_mask` has shape [bsz, 1, num_images, hidden_size] with elements equal to either 0.0 or a very negative number.
         # If any of the elements are 0.0, then the token is attending to at least one image and the gate value is 1. Otherwise the gate value is 0.
         # `cross_attention_gate` has shape [bsz, seq_len] with elements equal to either 0.0 or 1.0.
diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py
index e67a0845a7..37876080df 100644
--- a/src/transformers/models/idefics/processing_idefics.py
+++ b/src/transformers/models/idefics/processing_idefics.py
@@ -318,7 +318,7 @@ class IdeficsProcessor(ProcessorMixin):
         and the two images will be massaged using [`IdeficsImageProcessor.__call__`] method and placed inside the
         `pixel_values` dict entry of the return value.
 
-        This example also examplifies that images can be passed as objects or as text urls. It can be seen that the
+        This example also exemplifies that images can be passed as objects or as text urls. It can be seen that the
         first image is passed as object and the second one as a url.
 
         To do training do:
diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py
index 64d939e7b4..b72b6f0b53 100755
--- a/src/transformers/models/jamba/modeling_jamba.py
+++ b/src/transformers/models/jamba/modeling_jamba.py
@@ -598,7 +598,7 @@ class JambaMambaMixer(nn.Module):
 
         # projection of the input hidden states
         self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=self.use_bias)
-        # selective projection used to make dt, B and C input dependant
+        # selective projection used to make dt, B and C input dependent
         self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False)
         # time step projection (discretization)
         self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True)
@@ -1547,7 +1547,7 @@ class JambaForCausalLM(JambaPreTrainedModel, GenerationMixin):
         use_cache=True,
         **kwargs,
     ):
-        # Overwitten -- has a unique cache type, `HybridMambaAttentionDynamicCache`
+        # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache`
 
         empty_past_kv = past_key_values is None
 
diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py
index 4053c900bf..5846ea369d 100644
--- a/src/transformers/models/jetmoe/configuration_jetmoe.py
+++ b/src/transformers/models/jetmoe/configuration_jetmoe.py
@@ -57,7 +57,7 @@ class JetMoeConfig(PretrainedConfig):
         num_experts_per_tok (`int, *optional*, defaults to 2):
             The number of experts to route per-token and for MoE and MoA.
         output_router_logits (`bool`, *optional*, defaults to `False`):
-            Whether or not the router logits should be returned by the model. Enabeling this will also
+            Whether or not the router logits should be returned by the model. Enabling this will also
             allow the model to output the auxiliary loss.
         aux_loss_coef (`float`, *optional*, defaults to 0.01):
             The coefficient for the auxiliary loss.
diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py
index cabebb90ef..180f90676b 100644
--- a/src/transformers/models/jetmoe/modeling_jetmoe.py
+++ b/src/transformers/models/jetmoe/modeling_jetmoe.py
@@ -147,7 +147,7 @@ class JetMoeParallelExperts(nn.Module):
     def __init__(self, num_experts: int, input_size: int, output_size: int) -> None:
         """
         Initialize the JetMoeParallelExperts module.
-        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's comptible with
+        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
         many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
         [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
         [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm.py b/src/transformers/models/layoutlm/tokenization_layoutlm.py
index a0b61c93ac..135ca2f68a 100644
--- a/src/transformers/models/layoutlm/tokenization_layoutlm.py
+++ b/src/transformers/models/layoutlm/tokenization_layoutlm.py
@@ -465,7 +465,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
index d324c1ac7d..1fa23c32ae 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
@@ -1521,7 +1521,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py
index 8c79ae42f0..d55a9d9b0e 100644
--- a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py
+++ b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py
@@ -448,7 +448,7 @@ class LayoutLMv3SelfAttention(nn.Module):
             attention_scores = attention_scores + attention_mask
 
         # Normalize the attention scores to probabilities.
-        # Use the trick of the CogView paper to stablize training
+        # Use the trick of the CogView paper to stabilize training
         attention_probs = self.cogview_attention(attention_scores)
 
         # This is actually dropping out entire tokens to attend to, which might
diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py
index 9f80075338..be56ef1bb1 100755
--- a/src/transformers/models/led/modeling_led.py
+++ b/src/transformers/models/led/modeling_led.py
@@ -2467,7 +2467,7 @@ class LEDForSequenceClassification(LEDPreTrainedModel):
     def __init__(self, config: LEDConfig, **kwargs):
         warnings.warn(
             "The `transformers.LEDForSequenceClassification` class is deprecated and will be removed in version 5 of"
-            " Transformers. No actual method were provided in the original paper on how to perfom"
+            " Transformers. No actual method were provided in the original paper on how to perform"
             " sequence classification.",
             FutureWarning,
         )
diff --git a/src/transformers/models/llama4/convert_llama4_weights_to_hf.py b/src/transformers/models/llama4/convert_llama4_weights_to_hf.py
index 75b0ab8363..923d9ffc63 100644
--- a/src/transformers/models/llama4/convert_llama4_weights_to_hf.py
+++ b/src/transformers/models/llama4/convert_llama4_weights_to_hf.py
@@ -239,7 +239,7 @@ def write_model(
 
     config_kwargs = {}
     if params["use_scaled_rope"]:
-        # some constans from original code
+        # some constants from original code
         rope_scaling = {
             "rope_type": "llama3",
             "factor": 8.0,
@@ -288,7 +288,7 @@ def write_model(
         for_llm_compressor=_OFFLINE_QUANT_COMPATIBLE,
         **config_kwargs,
     )
-    # default vision config frmo params
+    # default vision config from params
 
     vision_params = params["vision_args"]
     vision_dim = vision_params["dim"]
diff --git a/src/transformers/models/llama4/modeling_llama4.py b/src/transformers/models/llama4/modeling_llama4.py
index d1611a2fe0..0959199c2e 100644
--- a/src/transformers/models/llama4/modeling_llama4.py
+++ b/src/transformers/models/llama4/modeling_llama4.py
@@ -857,7 +857,7 @@ class Llama4TextModel(Llama4PreTrainedModel):
         '?'         :  5 ⬚ ⬚ ⬚ ■ ■ ■     |
 
         If the chunk size is 3.
-        This can just be appplied over the already created attention mask
+        This can just be applied over the already created attention mask
         """
         arange_vector = torch.arange(start, end, device=device)
         block_pos = torch.abs(
@@ -894,7 +894,7 @@ class Llama4TextModel(Llama4PreTrainedModel):
             dtype (`torch.dtype`):
                 The dtype to use for the 4D attention mask.
             device (`torch.device`):
-                The device to plcae the 4D attention mask on.
+                The device to place the 4D attention mask on.
             cache_position (`torch.Tensor`):
                 Indices depicting the position of the input sequence tokens in the sequence.
             batch_size (`torch.Tensor`):
diff --git a/src/transformers/models/llava/convert_llava_weights_to_hf.py b/src/transformers/models/llava/convert_llava_weights_to_hf.py
index dafbf8bf2f..33dbe37d58 100644
--- a/src/transformers/models/llava/convert_llava_weights_to_hf.py
+++ b/src/transformers/models/llava/convert_llava_weights_to_hf.py
@@ -72,7 +72,7 @@ def load_original_state_dict(model_id):
                 for key in f.keys():
                     original_state_dict[key] = f.get_tensor(key)
 
-    # tied wieghts so lm.head is not saved. Let's clone to load state dict
+    # tied weights so lm.head is not saved. Let's clone to load state dict
     if "lm_head.weight" not in original_state_dict:
         original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone()
 
@@ -127,7 +127,7 @@ def convert_llava_llama_to_hf(text_model_id, vision_model_id, output_hub_path, o
         vision_config=vision_config,
     )
 
-    # llms-lab interleeave models do not use any selection startegy except for last hidden state
+    # llms-lab interleave models do not use any selection strategy except for last hidden state
     if "Qwen" in text_model_id:
         config.image_token_id = 151646
         if "siglip" in vision_model_id:
diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py
index 72a61bff71..be98c0a3e5 100644
--- a/src/transformers/models/llava/processing_llava.py
+++ b/src/transformers/models/llava/processing_llava.py
@@ -53,7 +53,7 @@ class LlavaProcessor(ProcessorMixin):
             Patch size from the vision tower.
         vision_feature_select_strategy (`str`, *optional*):
             The feature selection strategy used to select the vision feature from the vision backbone.
-            Shoudl be same as in model's config
+            Should be same as in model's config
         chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
             in a chat into a tokenizable string.
         image_token (`str`, *optional*, defaults to `"<image>"`):
diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py
index c212a549fc..63246e8a53 100644
--- a/src/transformers/models/llava_next/image_processing_llava_next.py
+++ b/src/transformers/models/llava_next/image_processing_llava_next.py
@@ -688,7 +688,7 @@ class LlavaNextImageProcessor(BaseImageProcessor):
         image_sizes = [get_image_size(image, channel_dim=input_data_format) for image in images]
         for image in images:
             # convert image into a list of patches
-            # we intentially use the same data format as the input data format
+            # we intentionally use the same data format as the input data format
             image_patches = self.get_image_patches(
                 image,
                 image_grid_pinpoints,
diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py
index 78175adc21..e1409b5d1d 100644
--- a/src/transformers/models/llava_next/processing_llava_next.py
+++ b/src/transformers/models/llava_next/processing_llava_next.py
@@ -56,7 +56,7 @@ class LlavaNextProcessor(ProcessorMixin):
             Patch size from the vision tower.
         vision_feature_select_strategy (`str`, *optional*):
             The feature selection strategy used to select the vision feature from the vision backbone.
-            Shoudl be same as in model's config
+            Should be same as in model's config
         chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
             in a chat into a tokenizable string.
         image_token (`str`, *optional*, defaults to `"<image>"`):
diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py
index c7ff0a1d7a..12ba442175 100644
--- a/src/transformers/models/llava_next_video/processing_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py
@@ -64,7 +64,7 @@ class LlavaNextVideoProcessor(ProcessorMixin):
             Patch size from the vision tower.
         vision_feature_select_strategy (`str`, *optional*):
             The feature selection strategy used to select the vision feature from the vision backbone.
-            Shoudl be same as in model's config
+            Should be same as in model's config
         video_token (`str`, *optional*, defaults to `"<video>"`):
             Special token used to denote video location.
         image_token (`str`, *optional*, defaults to `"<image>"`):
diff --git a/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py b/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py
index bd8b9e3c4c..8b0ebe03bf 100644
--- a/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py
+++ b/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py
@@ -69,7 +69,7 @@ def load_original_state_dict(model_id):
                 for key in f.keys():
                     original_state_dict[key] = f.get_tensor(key)
 
-    # tied wieghts so lm.head is not saved. Let's clone to load state dict
+    # tied weights so lm.head is not saved. Let's clone to load state dict
     if "lm_head.weight" not in original_state_dict:
         original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone()
 
diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py
index 23e03483f2..5a9bb5e3ea 100644
--- a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py
@@ -132,7 +132,7 @@ class LlavaOnevisionImageProcessor(BaseImageProcessor):
         image_grid_pinpoints (`List` *optional*, defaults to `[[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]]`):
             A list of possible resolutions to use for processing high resolution images. The best resolution is selected
             based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
-            method. Not used for processinf videos.
+            method. Not used for processing videos.
         resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
             Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
         do_rescale (`bool`, *optional*, defaults to `True`):
@@ -647,7 +647,7 @@ class LlavaOnevisionImageProcessor(BaseImageProcessor):
         image_sizes = [get_image_size(image, channel_dim=input_data_format) for image in images]
         for image in images:
             # convert image into a list of patches
-            # we intentially use the same data format as the input data format
+            # we intentionally use the same data format as the input data format
             size_tuple = (
                 (size["height"], size["width"])
                 if "height" in size and "width" in size
diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py
index 51d8dcf9c0..753abc924a 100644
--- a/src/transformers/models/llava_onevision/processing_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py
@@ -63,7 +63,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
             Number of image tokens for one imagethat will be returned by vision tower.
         vision_feature_select_strategy (`str`, *optional*):
             The feature selection strategy used to select the vision feature from the vision backbone.
-            Shoudl be same as in model's config
+            Should be same as in model's config
         chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
             in a chat into a tokenizable string.
         image_token (`str`, *optional*, defaults to `"<image>"`):
diff --git a/src/transformers/models/longt5/configuration_longt5.py b/src/transformers/models/longt5/configuration_longt5.py
index 9acac4e447..f54ec03c8f 100644
--- a/src/transformers/models/longt5/configuration_longt5.py
+++ b/src/transformers/models/longt5/configuration_longt5.py
@@ -54,7 +54,7 @@ class LongT5Config(PretrainedConfig):
         local_radius (`int`, *optional*, defaults to 127)
             Number of tokens to the left/right for each token to locally self-attend in a local attention mechanism.
         global_block_size (`int`, *optional*, defaults to 16)
-            Lenght of blocks an input sequence is divided into for a global token representation. Used only for
+            Length of blocks an input sequence is divided into for a global token representation. Used only for
             `encoder_attention_type = "transient-global"`.
         relative_attention_num_buckets (`int`, *optional*, defaults to 32):
             The number of buckets to use for each attention layer.
diff --git a/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py b/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py
index cf5c2d52d8..d997971073 100644
--- a/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py
+++ b/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py
@@ -198,7 +198,7 @@ def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name, flax_dump_f
         flax_model.params["lm_head"]["kernel"] = t5x_model["target"]["decoder"]["logits_dense"]["kernel"]
 
     flax_model.save_pretrained(flax_dump_folder_path)
-    print("T5X Model was sucessfully converted!")
+    print("T5X Model was successfully converted!")
 
 
 if __name__ == "__main__":
diff --git a/src/transformers/models/longt5/modeling_flax_longt5.py b/src/transformers/models/longt5/modeling_flax_longt5.py
index 7c5fdf9c17..1301704414 100644
--- a/src/transformers/models/longt5/modeling_flax_longt5.py
+++ b/src/transformers/models/longt5/modeling_flax_longt5.py
@@ -145,7 +145,7 @@ def _get_local_attention_mask(attention_mask: np.ndarray, block_len: int) -> jnp
 def _make_global_fixed_block_ids(attention_mask: np.ndarray, global_block_size: int) -> Tuple[jnp.ndarray, np.ndarray]:
     """Obtain the "fixed block" global id corresponding to each input token.
 
-    This implementation is a simlified version of the original Flaxformr implementation adopted from:
+    This implementation is a simplified version of the original Flaxformer implementation adopted from:
     https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.
 
     In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py
index b4434eae50..a509df9df2 100644
--- a/src/transformers/models/longt5/modeling_longt5.py
+++ b/src/transformers/models/longt5/modeling_longt5.py
@@ -158,7 +158,7 @@ def _make_global_fixed_block_ids(
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """Obtain the "fixed block" global id corresponding to each input token.
 
-    This implementation is a simlified version of the original Flaxformr implementation adopted from:
+    This implementation is a simplified version of the original Flaxformer implementation adopted from:
     https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.
 
     In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
diff --git a/src/transformers/models/lxmert/configuration_lxmert.py b/src/transformers/models/lxmert/configuration_lxmert.py
index c092d01148..18d3d2e60d 100644
--- a/src/transformers/models/lxmert/configuration_lxmert.py
+++ b/src/transformers/models/lxmert/configuration_lxmert.py
@@ -78,7 +78,7 @@ class LxmertConfig(PretrainedConfig):
             This represents the last dimension of the pooled-object features used as input for the model, representing
             the size of each object feature itself.
         visual_pos_dim (`int`, *optional*, defaults to 4):
-            This represents the number of spacial features that are mixed into the visual features. The default is set
+            This represents the number of spatial features that are mixed into the visual features. The default is set
             to 4 because most commonly this will represent the location of a bounding box. i.e., (x, y, width, height)
         visual_loss_normalizer (`float`, *optional*, defaults to 6.67):
             This represents the scaling factor in which each visual loss is multiplied by if during pretraining, one
diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py
index 1b8fb938a2..4f25a67177 100644
--- a/src/transformers/models/lxmert/modeling_lxmert.py
+++ b/src/transformers/models/lxmert/modeling_lxmert.py
@@ -832,8 +832,8 @@ LXMERT_INPUTS_DOCSTRING = r"""
 
             These are currently not provided by the transformers library.
         visual_pos (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_pos_dim)`):
-            This input represents spacial features corresponding to their relative (via index) visual features. The
-            pre-trained LXMERT model expects these spacial features to be normalized bounding boxes on a scale of 0 to
+            This input represents spatial features corresponding to their relative (via index) visual features. The
+            pre-trained LXMERT model expects these spatial features to be normalized bounding boxes on a scale of 0 to
             1.
 
             These are currently not provided by the transformers library.
diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py
index bd07c49f49..0efea46329 100644
--- a/src/transformers/models/lxmert/modeling_tf_lxmert.py
+++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py
@@ -1036,8 +1036,8 @@ LXMERT_INPUTS_DOCSTRING = r"""
 
             These are currently not provided by the transformers library.
         visual_pos (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
-            This input represents spacial features corresponding to their relative (via index) visual features. The
-            pre-trained LXMERT model expects these spacial features to be normalized bounding boxes on a scale of 0 to
+            This input represents spatial features corresponding to their relative (via index) visual features. The
+            pre-trained LXMERT model expects these spatial features to be normalized bounding boxes on a scale of 0 to
             1.
 
             These are currently not provided by the transformers library.
diff --git a/src/transformers/models/lxmert/tokenization_lxmert.py b/src/transformers/models/lxmert/tokenization_lxmert.py
index 2dea92f7e0..dcf2f80061 100644
--- a/src/transformers/models/lxmert/tokenization_lxmert.py
+++ b/src/transformers/models/lxmert/tokenization_lxmert.py
@@ -464,7 +464,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/mamba/configuration_mamba.py b/src/transformers/models/mamba/configuration_mamba.py
index 6bf80e5fe9..4f6c17af01 100644
--- a/src/transformers/models/mamba/configuration_mamba.py
+++ b/src/transformers/models/mamba/configuration_mamba.py
@@ -80,7 +80,7 @@ class MambaConfig(PretrainedConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the cache should be used.
         use_mambapy (`bool`, *optional*, defaults to `False`):
-            Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not avaiable. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited.
+            Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited.
 
 
     Example:
diff --git a/src/transformers/models/mamba2/modeling_mamba2.py b/src/transformers/models/mamba2/modeling_mamba2.py
index a1ca8d095c..99f7ebbb88 100644
--- a/src/transformers/models/mamba2/modeling_mamba2.py
+++ b/src/transformers/models/mamba2/modeling_mamba2.py
@@ -274,7 +274,7 @@ class Mamba2Mixer(nn.Module):
             projection_size,
             bias=config.use_bias,
         )
-        # selective projection used to make dt, B and C input dependant
+        # selective projection used to make dt, B and C input dependent
 
         # time step projection (discretization)
         # instantiate once and copy inv_dt in init_weights of PretrainedModel
@@ -1032,7 +1032,7 @@ class Mamba2ForCausalLM(Mamba2PreTrainedModel, GenerationMixin):
         attention_mask: Optional[torch.Tensor] = None,
         **kwargs,
     ):
-        # Overwitten -- uses `cache_params` as opposed to `past_key_values`
+        # Overwritten -- uses `cache_params` as opposed to `past_key_values`
 
         if use_cache:
             # `cache_position` should have been initialized in `generate`
diff --git a/src/transformers/models/marian/convert_marian_to_pytorch.py b/src/transformers/models/marian/convert_marian_to_pytorch.py
index 4cc9b15ce4..fd38b2e17c 100644
--- a/src/transformers/models/marian/convert_marian_to_pytorch.py
+++ b/src/transformers/models/marian/convert_marian_to_pytorch.py
@@ -68,7 +68,7 @@ def find_pretrained_model(src_lang: str, tgt_lang: str) -> List[str]:
     model_ids = [x.id for x in model_list if x.id.startswith("Helsinki-NLP")]
     src_and_targ = [
         remove_prefix(m, prefix).lower().split("-") for m in model_ids if "+" not in m
-    ]  # + cant be loaded.
+    ]  # + can't be loaded.
     matching = [f"{prefix}{a}-{b}" for (a, b) in src_and_targ if src_lang in a and tgt_lang in b]
     return matching
 
diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py
index affb6b8b67..5eeab498a5 100755
--- a/src/transformers/models/marian/modeling_marian.py
+++ b/src/transformers/models/marian/modeling_marian.py
@@ -1257,7 +1257,7 @@ class MarianMTModel(MarianPreTrainedModel, GenerationMixin):
             self._resize_final_logits_bias(new_num_tokens)
         return new_embeddings
 
-    # NOTE: `_resize_token_embeddings` was rewriten in the base class, *args exists to absorb the extra arg
+    # NOTE: `_resize_token_embeddings` was rewritten in the base class, *args exists to absorb the extra arg
     def _resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of=None, *args) -> nn.Embedding:
         old_embeddings = self.get_input_embeddings()
         new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of)
diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py
index 60d37ff35b..bdd48bc8f3 100644
--- a/src/transformers/models/mask2former/modeling_mask2former.py
+++ b/src/transformers/models/mask2former/modeling_mask2former.py
@@ -448,7 +448,7 @@ class Mask2FormerHungarianMatcher(nn.Module):
             pred_probs = class_queries_logits[i].softmax(-1)
             pred_mask = masks_queries_logits[i]
 
-            # Compute the classification cost. Contrary to the loss, we don't use the NLL, but approximate it in 1 - proba[target class]. The 1 is a constant that doesn't change the matching, it can be ommitted.
+            # Compute the classification cost. Contrary to the loss, we don't use the NLL, but approximate it in 1 - proba[target class]. The 1 is a constant that doesn't change the matching, it can be omitted.
             cost_class = -pred_probs[:, class_labels[i]]
             target_mask = mask_labels[i].to(pred_mask)
             target_mask = target_mask[:, None]
@@ -465,7 +465,7 @@ class Mask2FormerHungarianMatcher(nn.Module):
 
             # compute the cross entropy loss between each mask pairs -> shape (num_queries, num_labels)
             cost_mask = pair_wise_sigmoid_cross_entropy_loss(pred_mask, target_mask)
-            # Compute the dice loss betwen each mask pairs -> shape (num_queries, num_labels)
+            # Compute the dice loss between each mask pairs -> shape (num_queries, num_labels)
             cost_dice = pair_wise_dice_loss(pred_mask, target_mask)
             # final cost matrix
             cost_matrix = self.cost_mask * cost_mask + self.cost_class * cost_class + self.cost_dice * cost_dice
@@ -473,7 +473,7 @@ class Mask2FormerHungarianMatcher(nn.Module):
             cost_matrix = torch.minimum(cost_matrix, torch.tensor(1e10))
             cost_matrix = torch.maximum(cost_matrix, torch.tensor(-1e10))
             cost_matrix = torch.nan_to_num(cost_matrix, 0)
-            # do the assigmented using the hungarian algorithm in scipy
+            # do the assignment using the hungarian algorithm in scipy
             assigned_indices: Tuple[np.array] = linear_sum_assignment(cost_matrix.cpu())
             indices.append(assigned_indices)
 
@@ -1275,9 +1275,9 @@ class Mask2FormerPixelDecoder(nn.Module):
         _, height, width = mask.shape
         valid_height = torch.sum(~mask[:, :, 0], 1)
         valid_width = torch.sum(~mask[:, 0, :], 1)
-        valid_ratio_heigth = valid_height.to(dtype) / height
+        valid_ratio_height = valid_height.to(dtype) / height
         valid_ratio_width = valid_width.to(dtype) / width
-        valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1)
+        valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
         return valid_ratio
 
     def forward(
diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py
index 4f83cc59da..e354c21c40 100644
--- a/src/transformers/models/maskformer/modeling_maskformer.py
+++ b/src/transformers/models/maskformer/modeling_maskformer.py
@@ -871,7 +871,7 @@ class MaskFormerHungarianMatcher(nn.Module):
             pred_probs = pred_probs.softmax(-1)
             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
             # but approximate it in 1 - proba[target class].
-            # The 1 is a constant that doesn't change the matching, it can be ommitted.
+            # The 1 is a constant that doesn't change the matching, it can be omitted.
             cost_class = -pred_probs[:, labels]
             # flatten spatial dimension "q h w -> q (h w)"
             pred_mask_flat = pred_mask.flatten(1)  # [num_queries, height*width]
@@ -879,11 +879,11 @@ class MaskFormerHungarianMatcher(nn.Module):
             target_mask_flat = target_mask[:, 0].flatten(1)  # [num_total_labels, height*width]
             # compute the focal loss between each mask pairs -> shape (num_queries, num_labels)
             cost_mask = pair_wise_sigmoid_focal_loss(pred_mask_flat, target_mask_flat)
-            # Compute the dice loss betwen each mask pairs -> shape (num_queries, num_labels)
+            # Compute the dice loss between each mask pairs -> shape (num_queries, num_labels)
             cost_dice = pair_wise_dice_loss(pred_mask_flat, target_mask_flat)
             # final cost matrix
             cost_matrix = self.cost_mask * cost_mask + self.cost_class * cost_class + self.cost_dice * cost_dice
-            # do the assigmented using the hungarian algorithm in scipy
+            # do the assignment using the hungarian algorithm in scipy
             assigned_indices: Tuple[np.array] = linear_sum_assignment(cost_matrix.cpu())
             indices.append(assigned_indices)
 
@@ -923,7 +923,7 @@ class MaskFormerLoss(nn.Module):
             num_labels (`int`):
                 The number of classes.
             matcher (`MaskFormerHungarianMatcher`):
-                A torch module that computes the assigments between the predictions and labels.
+                A torch module that computes the assignments between the predictions and labels.
             weight_dict (`Dict[str, float]`):
                 A dictionary of weights to be applied to the different losses.
             eos_coef (`float`):
@@ -1085,7 +1085,7 @@ class MaskFormerLoss(nn.Module):
             - **loss_mask** -- The loss computed using sigmoid focal loss on the predicted and ground truth masks.
             - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth
               masks.
-            if `use_auxiliary_loss` was set to `true` in [`MaskFormerConfig`], the dictionary contains addional losses
+            if `use_auxiliary_loss` was set to `true` in [`MaskFormerConfig`], the dictionary contains additional losses
             for each auxiliary predictions.
         """
 
diff --git a/src/transformers/models/maskformer/modeling_maskformer_swin.py b/src/transformers/models/maskformer/modeling_maskformer_swin.py
index fd25e84ffe..3cb72c6c84 100644
--- a/src/transformers/models/maskformer/modeling_maskformer_swin.py
+++ b/src/transformers/models/maskformer/modeling_maskformer_swin.py
@@ -935,7 +935,7 @@ class MaskFormerSwinBackbone(MaskFormerSwinPreTrainedModel, BackboneMixin):
             zip(hidden_states, self.stage_names[1:], spatial_dimensions)
         ):
             norm = self.hidden_states_norms[i]
-            # the last element corespond to the layer's last block output but before patch merging
+            # the last element corresponds to the layer's last block output but before patch merging
             hidden_state_unpolled = hidden_state[-1]
             hidden_state_norm = norm(hidden_state_unpolled)
             # the pixel decoder (FPN) expects 3D tensors (features)
diff --git a/src/transformers/models/mistral/modeling_tf_mistral.py b/src/transformers/models/mistral/modeling_tf_mistral.py
index 34b4da4c33..52f076a05a 100644
--- a/src/transformers/models/mistral/modeling_tf_mistral.py
+++ b/src/transformers/models/mistral/modeling_tf_mistral.py
@@ -326,7 +326,7 @@ class TFMistralAttention(keras.layers.Layer):
         )
 
         if past_key_value is not None:
-            # resue k, v, self_attention
+            # reuse k, v, self_attention
             key_states = tf.concat([past_key_value[0], key_states], axis=2)
             value_states = tf.concat([past_key_value[1], value_states], axis=2)
 
diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py
index c8f7eaccdb..066b045ee1 100644
--- a/src/transformers/models/mixtral/configuration_mixtral.py
+++ b/src/transformers/models/mixtral/configuration_mixtral.py
@@ -87,7 +87,7 @@ class MixtralConfig(PretrainedConfig):
         num_local_experts (`int`, *optional*, defaults to 8):
             Number of experts per Sparse MLP layer.
         output_router_logits (`bool`, *optional*, defaults to `False`):
-            Whether or not the router logits should be returned by the model. Enabeling this will also
+            Whether or not the router logits should be returned by the model. Enabling this will also
             allow the model to output the auxiliary loss. See [here]() for more details
         router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
             The aux loss factor for the total loss.
diff --git a/src/transformers/models/mllama/convert_mllama_weights_to_hf.py b/src/transformers/models/mllama/convert_mllama_weights_to_hf.py
index 2d361af61e..f5f338fd8d 100644
--- a/src/transformers/models/mllama/convert_mllama_weights_to_hf.py
+++ b/src/transformers/models/mllama/convert_mllama_weights_to_hf.py
@@ -234,7 +234,7 @@ def write_model(
     text_rope_theta = params["rope_theta"]
     cross_attention_num_layers = params["vision_num_cross_attention_layers"]
 
-    # some constans from original code
+    # some constants from original code
     rope_scaling = {
         "rope_type": "llama3",
         "factor": 8.0,
diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert.py b/src/transformers/models/mobilebert/tokenization_mobilebert.py
index 8a50f539da..1f9de55270 100644
--- a/src/transformers/models/mobilebert/tokenization_mobilebert.py
+++ b/src/transformers/models/mobilebert/tokenization_mobilebert.py
@@ -466,7 +466,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/modernbert/modeling_modernbert.py b/src/transformers/models/modernbert/modeling_modernbert.py
index accc07f6c3..974a4dc1e4 100644
--- a/src/transformers/models/modernbert/modeling_modernbert.py
+++ b/src/transformers/models/modernbert/modeling_modernbert.py
@@ -160,7 +160,7 @@ class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
         """
         max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
             up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
-            the cos_sin_cache wll be recomputed during the forward pass.
+            the cos_sin_cache will be recomputed during the forward pass.
         """
         super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
         self.max_seqlen = max_seqlen
diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py
index f687324c5a..9e74b4d787 100644
--- a/src/transformers/models/modernbert/modular_modernbert.py
+++ b/src/transformers/models/modernbert/modular_modernbert.py
@@ -423,7 +423,7 @@ class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
         """
         max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
             up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
-            the cos_sin_cache wll be recomputed during the forward pass.
+            the cos_sin_cache will be recomputed during the forward pass.
         """
         super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
         self.max_seqlen = max_seqlen
diff --git a/src/transformers/models/mpnet/tokenization_mpnet.py b/src/transformers/models/mpnet/tokenization_mpnet.py
index 2c8da3b41c..0aca1cf261 100644
--- a/src/transformers/models/mpnet/tokenization_mpnet.py
+++ b/src/transformers/models/mpnet/tokenization_mpnet.py
@@ -490,7 +490,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/mpt/modeling_mpt.py b/src/transformers/models/mpt/modeling_mpt.py
index 4de9999039..bff4c0b06c 100644
--- a/src/transformers/models/mpt/modeling_mpt.py
+++ b/src/transformers/models/mpt/modeling_mpt.py
@@ -69,7 +69,7 @@ def build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max=8, device=
 
 class MptAttention(nn.Module):
     """Multi-head self attention.
-    Using torch or triton attention implemetation enables user to also use additive bias.
+    Using torch or triton attention implementation enables user to also use additive bias.
     """
 
     def __init__(self, config: MptConfig):
diff --git a/src/transformers/models/mra/convert_mra_pytorch_to_pytorch.py b/src/transformers/models/mra/convert_mra_pytorch_to_pytorch.py
index b35cd7662d..cdee89efda 100644
--- a/src/transformers/models/mra/convert_mra_pytorch_to_pytorch.py
+++ b/src/transformers/models/mra/convert_mra_pytorch_to_pytorch.py
@@ -87,7 +87,7 @@ def convert_mra_checkpoint(checkpoint_path, mra_config_file, pytorch_dump_path):
     model.eval()
     model.save_pretrained(pytorch_dump_path)
 
-    print(f"Checkpoint successfuly converted. Model saved at {pytorch_dump_path}")
+    print(f"Checkpoint successfully converted. Model saved at {pytorch_dump_path}")
 
 
 if __name__ == "__main__":
diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
index 07a593ca2a..9170f947fe 100644
--- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
+++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
@@ -1693,7 +1693,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin):
         return self.text_encoder
 
     def get_encoder(self):
-        # get the text encoder to compute the conditionning hidden-states for generation
+        # get the text encoder to compute the conditioning hidden-states for generation
         return self.get_text_encoder()
 
     def get_decoder(self):
diff --git a/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py
index 6664a7d8ad..7be4869181 100644
--- a/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py
@@ -88,7 +88,7 @@ def convert_nystromformer_checkpoint(checkpoint_path, nystromformer_config_file,
     model.eval()
     model.save_pretrained(pytorch_dump_path)
 
-    print(f"Checkpoint successfuly converted. Model saved at {pytorch_dump_path}")
+    print(f"Checkpoint successfully converted. Model saved at {pytorch_dump_path}")
 
 
 if __name__ == "__main__":
diff --git a/src/transformers/models/olmoe/configuration_olmoe.py b/src/transformers/models/olmoe/configuration_olmoe.py
index 0f24d5523b..2b85ea55e3 100644
--- a/src/transformers/models/olmoe/configuration_olmoe.py
+++ b/src/transformers/models/olmoe/configuration_olmoe.py
@@ -86,7 +86,7 @@ class OlmoeConfig(PretrainedConfig):
         num_experts (`int`, *optional*, defaults to 64):
             Number of routed experts.
         output_router_logits (`bool`, *optional*, defaults to `False`):
-            Whether or not the router logits should be returned by the model. Enabeling this will also
+            Whether or not the router logits should be returned by the model. Enabling this will also
             allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
         router_aux_loss_coef (`float`, *optional*, defaults to 0.01):
             The aux loss factor for the total loss.
diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py
index 2f577d0d4d..47438cde41 100644
--- a/src/transformers/models/oneformer/configuration_oneformer.py
+++ b/src/transformers/models/oneformer/configuration_oneformer.py
@@ -74,7 +74,7 @@ class OneFormerConfig(PretrainedConfig):
         importance_sample_ratio (`float`, *optional*, defaults to 0.75):
             Ratio of points that are sampled via importance sampling.
         init_std (`float`, *optional*, defaults to 0.02):
-            Standard deviation for normal intialization.
+            Standard deviation for normal initialization.
         init_xavier_std (`float`, *optional*, defaults to 1.0):
             Standard deviation for xavier uniform initialization.
         layer_norm_eps (`float`, *optional*, defaults to 1e-05):
diff --git a/src/transformers/models/oneformer/modeling_oneformer.py b/src/transformers/models/oneformer/modeling_oneformer.py
index 7c3ecb3611..212edc1c85 100644
--- a/src/transformers/models/oneformer/modeling_oneformer.py
+++ b/src/transformers/models/oneformer/modeling_oneformer.py
@@ -261,7 +261,7 @@ class OneFormerHungarianMatcher(nn.Module):
         """
         super().__init__()
         if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
-            raise ValueError("All costs cant be 0")
+            raise ValueError("All costs can't be 0")
         self.cost_class = cost_class
         self.cost_mask = cost_mask
         self.cost_dice = cost_dice
@@ -304,7 +304,7 @@ class OneFormerHungarianMatcher(nn.Module):
             pred_probs = pred_probs.softmax(-1)
             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
             # but approximate it in 1 - proba[target class].
-            # The 1 is a constant that doesn't change the matching, it can be ommitted.
+            # The 1 is a constant that doesn't change the matching, it can be omitted.
             cost_class = -pred_probs[:, labels]
 
             pred_mask = pred_mask[:, None]
@@ -371,7 +371,7 @@ class OneFormerLoss(nn.Module):
             num_labels (`int`):
                 The number of classes.
             matcher (`OneFormerHungarianMatcher`):
-                A torch module that computes the assigments between the predictions and labels.
+                A torch module that computes the assignments between the predictions and labels.
             weight_dict (`Dict[str, float]`):
                 A dictionary of weights to be applied to the different losses.
             eos_coef (`float`):
@@ -684,7 +684,7 @@ class OneFormerLoss(nn.Module):
             - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth
               masks.
             - **loss_contrastive** -- The query-text contrstive loss computed using object and text queries.
-            if `use_auxiliary_loss` was set to `true` in [`OneFormerConfig`], the dictionary contains addional losses
+            if `use_auxiliary_loss` was set to `true` in [`OneFormerConfig`], the dictionary contains additional losses
             for each auxiliary predictions.
         """
 
@@ -1367,9 +1367,9 @@ class OneFormerPixelDecoder(nn.Module):
         _, height, width = mask.shape
         valid_height = torch.sum(~mask[:, :, 0], 1)
         valid_width = torch.sum(~mask[:, 0, :], 1)
-        valid_ratio_heigth = valid_height.to(dtype) / height
+        valid_ratio_height = valid_height.to(dtype) / height
         valid_ratio_width = valid_width.to(dtype) / width
-        valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1)
+        valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
         return valid_ratio
 
     def forward(
diff --git a/src/transformers/models/owlv2/image_processing_owlv2.py b/src/transformers/models/owlv2/image_processing_owlv2.py
index 71b88ef1ad..bc211d1fb4 100644
--- a/src/transformers/models/owlv2/image_processing_owlv2.py
+++ b/src/transformers/models/owlv2/image_processing_owlv2.py
@@ -88,7 +88,7 @@ def _scale_boxes(boxes, target_sizes):
     else:
         raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
 
-    # for owlv2 image is padded to max size unlike owlvit, thats why we have to scale boxes to max size
+    # for owlv2 the image is padded to max size (unlike owlvit), which is why we have to scale boxes to max size
     max_size = torch.max(image_height, image_width)
 
     scale_factor = torch.stack([max_size, max_size, max_size, max_size], dim=1)
diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py
index 4996cae7ab..ce4a3ed49d 100644
--- a/src/transformers/models/owlv2/processing_owlv2.py
+++ b/src/transformers/models/owlv2/processing_owlv2.py
@@ -58,7 +58,7 @@ class Owlv2ProcessorKwargs(ProcessingKwargs, total=False):
 class Owlv2Processor(ProcessorMixin):
     r"""
     Constructs an Owlv2 processor which wraps [`Owlv2ImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`] into
-    a single processor that interits both the image processor and tokenizer functionalities. See the
+    a single processor that inherits both the image processor and tokenizer functionalities. See the
     [`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information.
 
     Args:
diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py
index 859e28bfcc..564b198ac1 100644
--- a/src/transformers/models/owlvit/processing_owlvit.py
+++ b/src/transformers/models/owlvit/processing_owlvit.py
@@ -58,7 +58,7 @@ class OwlViTProcessorKwargs(ProcessingKwargs, total=False):
 class OwlViTProcessor(ProcessorMixin):
     r"""
     Constructs an OWL-ViT processor which wraps [`OwlViTImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`]
-    into a single processor that interits both the image processor and tokenizer functionalities. See the
+    into a single processor that inherits both the image processor and tokenizer functionalities. See the
     [`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information.
 
     Args:
diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py
index ae09d410ac..95897db85f 100755
--- a/src/transformers/models/patchtst/modeling_patchtst.py
+++ b/src/transformers/models/patchtst/modeling_patchtst.py
@@ -669,7 +669,7 @@ class PatchTSTPositionalEncoding(nn.Module):
             # cls_token: [1 x num_input_channels x 1 x d_model]
             self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model))
             num_patches += 1
-        # postional encoding: [num_patches x d_model]
+        # positional encoding: [num_patches x d_model]
         self.position_enc = self._init_pe(config, num_patches)
         # Positional dropout
         self.positional_dropout = (
diff --git a/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py b/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py
index b1a4ac90ac..e4e2d27909 100644
--- a/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py
+++ b/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py
@@ -191,7 +191,7 @@ def convert_and_save_processor(input_dir: str, output_dir: str):
     # Save the processor
     converted_processor.save_pretrained(output_dir)
 
-    # we need to rename a few tokens but tokenizers doesn't allow doing that programatically
+    # we need to rename a few tokens but tokenizers doesn't allow doing that programmatically
     # To avoid consufion and manual renaming, the below part load and re-saved each json file
     vocab = json.load(open(f"{output_dir}/vocab.json", "r"))
     vocab["<|endoftext11|>"] = "<|audio|>"
diff --git a/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py
index 5d29af6c8b..6f1a19fab6 100644
--- a/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py
@@ -56,7 +56,7 @@ def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
         fmax = sample_rate / 2
     if fmin is None:
         fmin = 0
-    assert fmin >= 0, "fmin cannot be negtive"
+    assert fmin >= 0, "fmin cannot be negative"
     assert fmin < fmax <= sample_rate / 2, "fmax must be between (fmin, samplerate / 2]"
 
     def mel(f):
@@ -74,7 +74,7 @@ def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
 
     khi = max(khi, klo)
 
-    # Spec 2: SpeechLib uses trianges in Mel space
+    # Spec 2: SpeechLib uses triangles in Mel space
     mlo = mel(fmin)
     mhi = mel(fmax)
     m_centers = np.linspace(mlo, mhi, n_mels + 2)
diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
index 72558e3f7c..f2f691139d 100644
--- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
@@ -1128,7 +1128,7 @@ class Phi4MultimodalAudioModel(Phi4MultimodalAudioPreTrainedModel):
         seq_len = math.ceil(hidden_states.shape[1] / self.config.time_reduction)
         if seq_len <= 0:
             raise ValueError(
-                f"The squence length after time reduction is invalid: {seq_len}. Your input feature is too short."
+                f"The sequence length after time reduction is invalid: {seq_len}. Your input feature is too short."
             )
 
         batch_size = hidden_states.shape[0]
@@ -1173,7 +1173,7 @@ class Phi4MultimodalAudioModel(Phi4MultimodalAudioPreTrainedModel):
 
         unfolded = False
         bs, seq_len, _ = hidden_states.shape
-        max_seq_len = 500  # maxium position for absolute positional encoding
+        max_seq_len = 500  # maximum position for absolute positional encoding
         if seq_len > max_seq_len:
             # audio sequence is longer than max_seq_len, unfold it into chunks of max_seq_len
             unfolded = True
diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
index b9d6bc0daf..d13fa8dc8b 100644
--- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
@@ -1186,7 +1186,7 @@ class Phi4MultimodalAudioModel(Phi4MultimodalAudioPreTrainedModel):
         seq_len = math.ceil(hidden_states.shape[1] / self.config.time_reduction)
         if seq_len <= 0:
             raise ValueError(
-                f"The squence length after time reduction is invalid: {seq_len}. Your input feature is too short."
+                f"The sequence length after time reduction is invalid: {seq_len}. Your input feature is too short."
             )
 
         batch_size = hidden_states.shape[0]
@@ -1231,7 +1231,7 @@ class Phi4MultimodalAudioModel(Phi4MultimodalAudioPreTrainedModel):
 
         unfolded = False
         bs, seq_len, _ = hidden_states.shape
-        max_seq_len = 500  # maxium position for absolute positional encoding
+        max_seq_len = 500  # maximum position for absolute positional encoding
         if seq_len > max_seq_len:
             # audio sequence is longer than max_seq_len, unfold it into chunks of max_seq_len
             unfolded = True
diff --git a/src/transformers/models/pop2piano/feature_extraction_pop2piano.py b/src/transformers/models/pop2piano/feature_extraction_pop2piano.py
index 0b191ab106..e4a29de6a3 100644
--- a/src/transformers/models/pop2piano/feature_extraction_pop2piano.py
+++ b/src/transformers/models/pop2piano/feature_extraction_pop2piano.py
@@ -258,7 +258,7 @@ class Pop2PianoFeatureExtractor(SequenceFeatureExtractor):
             )
 
             if add_zero_line:
-                # if it is batched then we seperate each examples using zero array
+                # if it is batched then we separate the examples using a zero array
                 zero_array_len = max([*zip(*features_shapes)][1])
 
                 # we concatenate the zero array line here
@@ -304,14 +304,14 @@ class Pop2PianoFeatureExtractor(SequenceFeatureExtractor):
             to it:
             - **attention_mask** numpy.ndarray of shape `(batch_size, max_input_features_seq_length)` --
                 Example :
-                    1, 1, 1, 0, 0 (audio 1, also here it is padded to max length of 5 thats why there are 2 zeros at
+                    1, 1, 1, 0, 0 (audio 1; it is padded to a max length of 5, which is why there are 2 zeros at
                     the end indicating they are padded)
 
-                    0, 0, 0, 0, 0 (zero pad to seperate audio 1 and 2)
+                    0, 0, 0, 0, 0 (zero pad to separate audio 1 and 2)
 
                     1, 1, 1, 1, 1 (audio 2)
 
-                    0, 0, 0, 0, 0 (zero pad to seperate audio 2 and 3)
+                    0, 0, 0, 0, 0 (zero pad to separate audio 2 and 3)
 
                     1, 1, 1, 1, 1 (audio 3)
             - **attention_mask_beatsteps** numpy.ndarray of shape `(batch_size, max_beatsteps_seq_length)`
@@ -333,7 +333,7 @@ class Pop2PianoFeatureExtractor(SequenceFeatureExtractor):
                     processed_features_dict[f"attention_mask_{feature_name}"] = attention_mask
 
         # If we are processing only one example, we should remove the zero array line since we don't need it to
-        # seperate examples from each other.
+        # separate examples from each other.
         if not is_batched and not return_attention_mask:
             processed_features_dict["input_features"] = processed_features_dict["input_features"][:-1, ...]
 
diff --git a/src/transformers/models/pop2piano/tokenization_pop2piano.py b/src/transformers/models/pop2piano/tokenization_pop2piano.py
index 0a7a8a5586..22bf21e2f4 100644
--- a/src/transformers/models/pop2piano/tokenization_pop2piano.py
+++ b/src/transformers/models/pop2piano/tokenization_pop2piano.py
@@ -669,7 +669,7 @@ class Pop2PianoTokenizer(PreTrainedTokenizer):
                 )
 
         if attention_masks_present:
-            # check for zeros(since token_ids are seperated by zero arrays)
+            # check for zeros(since token_ids are separated by zero arrays)
             batch_idx = np.where(feature_extractor_output["attention_mask"][:, 0] == 0)[0]
         else:
             batch_idx = [token_ids.shape[0]]
diff --git a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py
index 898475835b..2940932fe5 100644
--- a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py
+++ b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py
@@ -102,22 +102,22 @@ class PromptDepthAnythingImageProcessor(BaseImageProcessor):
 
     Args:
         do_resize (`bool`, *optional*, defaults to `True`):
-            Whether to resize the image's (height, width) dimensions. Can be overidden by `do_resize` in `preprocess`.
+            Whether to resize the image's (height, width) dimensions. Can be overridden by `do_resize` in `preprocess`.
         size (`Dict[str, int]` *optional*, defaults to `{"height": 384, "width": 384}`):
-            Size of the image after resizing. Can be overidden by `size` in `preprocess`.
+            Size of the image after resizing. Can be overridden by `size` in `preprocess`.
         resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
-            Defines the resampling filter to use if resizing the image. Can be overidden by `resample` in `preprocess`.
+            Defines the resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
         keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
             If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
-            be overidden by `keep_aspect_ratio` in `preprocess`.
+            be overridden by `keep_aspect_ratio` in `preprocess`.
         ensure_multiple_of (`int`, *optional*, defaults to 1):
-            If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overidden
+            If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
             by `ensure_multiple_of` in `preprocess`.
         do_rescale (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the image by the specified scale `rescale_factor`. Can be overidden by `do_rescale` in
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
             `preprocess`.
         rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
-            Scale factor to use if rescaling the image. Can be overidden by `rescale_factor` in `preprocess`.
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in `preprocess`.
         do_normalize (`bool`, *optional*, defaults to `True`):
             Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
             method.
diff --git a/src/transformers/models/prophetnet/tokenization_prophetnet.py b/src/transformers/models/prophetnet/tokenization_prophetnet.py
index 276dbb8438..51257f76db 100644
--- a/src/transformers/models/prophetnet/tokenization_prophetnet.py
+++ b/src/transformers/models/prophetnet/tokenization_prophetnet.py
@@ -213,7 +213,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/pvt_v2/convert_pvt_v2_to_pytorch.py b/src/transformers/models/pvt_v2/convert_pvt_v2_to_pytorch.py
index b315d540da..5e4509b249 100644
--- a/src/transformers/models/pvt_v2/convert_pvt_v2_to_pytorch.py
+++ b/src/transformers/models/pvt_v2/convert_pvt_v2_to_pytorch.py
@@ -35,7 +35,7 @@ logger = logging.get_logger(__name__)
 def create_rename_keys(config):
     rename_keys = []
     for i in range(config.num_encoder_blocks):
-        # Remane embedings' paramters
+        # Rename embeddings' parameters
         rename_keys.append(
             (f"patch_embed{i + 1}.proj.weight", f"pvt_v2.encoder.layers.{i}.patch_embedding.proj.weight")
         )
diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
index bb1d27ea91..132b562f9a 100644
--- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
@@ -154,7 +154,7 @@ class Qwen2_5OmniAudioEncoderConfig(PretrainedConfig):
         n_window (`int`, *optional*, defaults to 100):
             The chunk for conv and flash attn in AudioEncoder.
         output_dim (`int`, *optional*, defaults to 3584):
-            The output dimention of AudioEncoder.
+            The output dimension of AudioEncoder.
 
     Example:
 
@@ -979,7 +979,7 @@ class Qwen2_5OmniConfig(PretrainedConfig):
         thinker_config (`dict`, *optional*): Configuration of the underlying thinker sub-model.
         talker_config (`dict`, *optional*): Configuration of the underlying talker sub-model.
         token2wav_config (`dict`, *optional*): Configuration of the underlying codec sub-model.
-        enable_audio_output (`bool`, *optional*, defaults to `True`): Whether enabel audio output and load talker and token2wav module.
+        enable_audio_output (`bool`, *optional*, defaults to `True`): Whether to enable audio output and load talker and token2wav module.
 
     Example:
 
@@ -1054,7 +1054,7 @@ class Qwen2_5OmniConfig(PretrainedConfig):
             decoder (`Optional[bool]`, *optional*, defaults to `False`):
                 If set to `True`, then only search for decoder config names.
         """
-        # Overriden for deeply nested config like Qwen2-Omni. We don't have any omni model
+        # Overridden for deeply nested config like Qwen2-Omni. We don't have any omni model
         # except for Qwen yet. This has to be generalized if more deeply nested configs are
         # added. NOTE: currently method used only by vLLM
         return self.thinker_config.get_text_config()
diff --git a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
index e79f641de3..17551dc5ae 100644
--- a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
@@ -180,7 +180,7 @@ class Qwen2_5OmniPreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedMo
             dtype (`torch.dtype`):
                 The dtype to use for the 4D attention mask.
             device (`torch.device`):
-                The device to plcae the 4D attention mask on.
+                The device to place the 4D attention mask on.
             min_dtype (`float`):
                 The minimum value representable with the dtype `dtype`.
             cache_position (`torch.Tensor`):
@@ -678,7 +678,7 @@ class Qwen2_5OmniAudioFlashAttention2(Qwen2_5OmniAudioAttention):
         super().__init__(*args, **kwargs)
 
         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
 
@@ -4492,10 +4492,10 @@ class Qwen2_5OmniForConditionalGeneration(Qwen2_5OmniPreTrainedModel, Generation
                 - **Audio waveform** (`torch.Tensor`): Generated audio waveform.
         """
         if speaker not in self.speaker_map:
-            raise ValueError(f"{speaker} is not availible, availible speakers: {self.speaker_map.keys()}")
+            raise ValueError(f"{speaker} is not available, available speakers: {self.speaker_map.keys()}")
         if return_audio and not self.has_talker:
             raise ValueError(
-                "Cannot use talker when talker module not initalized. Use `enable_talker` method or set enable_talker in config to enable talker."
+                "Cannot use talker when talker module not initialized. Use `enable_talker` method or set enable_talker in config to enable talker."
             )
         if return_audio is None:
             return_audio = self.has_talker
diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
index 294a72c6b6..9d3f2d9ec6 100644
--- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
@@ -195,7 +195,7 @@ class Qwen2_5OmniAudioEncoderConfig(Qwen2AudioEncoderConfig):
         n_window (`int`, *optional*, defaults to 100):
             The chunk for conv and flash attn in AudioEncoder.
         output_dim (`int`, *optional*, defaults to 3584):
-            The output dimention of AudioEncoder.
+            The output dimension of AudioEncoder.
 
     Example:
 
@@ -964,7 +964,7 @@ class Qwen2_5OmniConfig(PretrainedConfig):
         thinker_config (`dict`, *optional*): Configuration of the underlying thinker sub-model.
         talker_config (`dict`, *optional*): Configuration of the underlying talker sub-model.
         token2wav_config (`dict`, *optional*): Configuration of the underlying codec sub-model.
-        enable_audio_output (`bool`, *optional*, defaults to `True`): Whether enabel audio output and load talker and token2wav module.
+        enable_audio_output (`bool`, *optional*, defaults to `True`): Whether to enable audio output and load talker and token2wav module.
 
     Example:
 
@@ -1039,7 +1039,7 @@ class Qwen2_5OmniConfig(PretrainedConfig):
             decoder (`Optional[bool]`, *optional*, defaults to `False`):
                 If set to `True`, then only search for decoder config names.
         """
-        # Overriden for deeply nested config like Qwen2-Omni. We don't have any omni model
+        # Overridden for deeply nested config like Qwen2-Omni. We don't have any omni model
         # except for Qwen yet. This has to be generalized if more deeply nested configs are
         # added. NOTE: currently method used only by vLLM
         return self.thinker_config.get_text_config()
@@ -1095,7 +1095,7 @@ class Qwen2_5OmniPreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedMo
             dtype (`torch.dtype`):
                 The dtype to use for the 4D attention mask.
             device (`torch.device`):
-                The device to plcae the 4D attention mask on.
+                The device to place the 4D attention mask on.
             min_dtype (`float`):
                 The minimum value representable with the dtype `dtype`.
             cache_position (`torch.Tensor`):
@@ -1593,7 +1593,7 @@ class Qwen2_5OmniAudioFlashAttention2(Qwen2_5OmniAudioAttention):
         super().__init__(*args, **kwargs)
 
         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
 
@@ -4179,10 +4179,10 @@ class Qwen2_5OmniForConditionalGeneration(Qwen2_5OmniPreTrainedModel, Generation
                 - **Audio waveform** (`torch.Tensor`): Generated audio waveform.
         """
         if speaker not in self.speaker_map:
-            raise ValueError(f"{speaker} is not availible, availible speakers: {self.speaker_map.keys()}")
+            raise ValueError(f"{speaker} is not available, available speakers: {self.speaker_map.keys()}")
         if return_audio and not self.has_talker:
             raise ValueError(
-                "Cannot use talker when talker module not initalized. Use `enable_talker` method or set enable_talker in config to enable talker."
+                "Cannot use talker when talker module not initialized. Use `enable_talker` method or set enable_talker in config to enable talker."
             )
         if return_audio is None:
             return_audio = self.has_talker
diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
index 8c494fdc6d..c90c90c72b 100644
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
@@ -209,7 +209,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
             image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                 Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
             patch_size (`int`, *optional*, defaults to `self.patch_size`):
-                The spacial patch size of the vision encoder.
+                The spatial patch size of the vision encoder.
             temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
                 The temporal patch size of the vision encoder.
             merge_size (`int`, *optional*, defaults to `self.merge_size`):
@@ -352,7 +352,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
             max_pixels (`int`, *optional*, defaults to `self.max_pixels`):
                 The max pixels of the image to resize the image.
             patch_size (`int`, *optional*, defaults to `self.patch_size`):
-                The spacial patch size of the vision encoder.
+                The spatial patch size of the vision encoder.
             temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
                 The temporal patch size of the vision encoder.
             merge_size (`int`, *optional*, defaults to `self.merge_size`):
diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
index 60b62449d3..d44ea279cb 100644
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
@@ -169,7 +169,7 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
             image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                 Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
             patch_size (`int`, *optional*, defaults to `self.patch_size`):
-                The spacial patch size of the vision encoder.
+                The spatial patch size of the vision encoder.
             temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
                 The temporal patch size of the vision encoder.
             merge_size (`int`, *optional*, defaults to `self.merge_size`):
@@ -306,7 +306,7 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
             max_pixels (`int`, *optional*, defaults to `self.max_pixels`):
                 The max pixels of the image to resize the image.
             patch_size (`int`, *optional*, defaults to `self.patch_size`):
-                The spacial patch size of the vision encoder.
+                The spatial patch size of the vision encoder.
             temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
                 The temporal patch size of the vision encoder.
             merge_size (`int`, *optional*, defaults to `self.merge_size`):
diff --git a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py
index 0997bbd776..bac47bb85f 100644
--- a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py
+++ b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py
@@ -124,7 +124,7 @@ class Qwen3MoeConfig(PretrainedConfig):
         norm_topk_prob (`bool`, *optional*, defaults to `False`):
             Whether to normalize the topk probabilities.
         output_router_logits (`bool`, *optional*, defaults to `False`):
-            Whether or not the router logits should be returned by the model. Enabeling this will also
+            Whether or not the router logits should be returned by the model. Enabling this will also
             allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
         router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
             The aux loss factor for the total loss.
diff --git a/src/transformers/models/rag/configuration_rag.py b/src/transformers/models/rag/configuration_rag.py
index fd0c9bb9cc..dca4eb04d3 100644
--- a/src/transformers/models/rag/configuration_rag.py
+++ b/src/transformers/models/rag/configuration_rag.py
@@ -126,7 +126,7 @@ class RagConfig(PretrainedConfig):
         )
         if "question_encoder" not in kwargs or "generator" not in kwargs:
             raise ValueError(
-                f"A configuraton of type {self.model_type} cannot be instantiated because "
+                f"A configuration of type {self.model_type} cannot be instantiated because "
                 f"both `question_encoder` and `generator` sub-configurations were not passed, only {kwargs}"
             )
         question_encoder_config = kwargs.pop("question_encoder")
diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py
index 9c670683c9..79fa08c6c5 100644
--- a/src/transformers/models/rag/modeling_tf_rag.py
+++ b/src/transformers/models/rag/modeling_tf_rag.py
@@ -1186,7 +1186,7 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
                 raise ValueError(
                     "Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
                     f" num_return_sequences, got {generation_config.num_beams} and"
-                    f" {generation_config.num_return_sequences} (respectivelly)"
+                    f" {generation_config.num_return_sequences} (respectively)"
                 )
 
             def unflatten_beam_dim(tensor):
diff --git a/src/transformers/models/regnet/convert_regnet_to_pytorch.py b/src/transformers/models/regnet/convert_regnet_to_pytorch.py
index 9544400416..ab8b34500d 100644
--- a/src/transformers/models/regnet/convert_regnet_to_pytorch.py
+++ b/src/transformers/models/regnet/convert_regnet_to_pytorch.py
@@ -90,7 +90,7 @@ class ModuleTransfer:
         for dest_m, src_m in zip(dest_traced, src_traced):
             dest_m.load_state_dict(src_m.state_dict())
             if self.verbose == 1:
-                print(f"Transfered from={src_m} to={dest_m}")
+                print(f"Transferred from={src_m} to={dest_m}")
 
 
 class FakeRegNetVisslWrapper(nn.Module):
diff --git a/src/transformers/models/regnet/modeling_flax_regnet.py b/src/transformers/models/regnet/modeling_flax_regnet.py
index f575e33db9..4a4e0a424a 100644
--- a/src/transformers/models/regnet/modeling_flax_regnet.py
+++ b/src/transformers/models/regnet/modeling_flax_regnet.py
@@ -623,7 +623,7 @@ class FlaxRegNetPreTrainedModel(FlaxPreTrainedModel):
             output_hidden_states,
             return_dict,
             rngs=rngs,
-            mutable=["batch_stats"] if train else False,  # Returing tuple with batch_stats only when train is True
+            mutable=["batch_stats"] if train else False,  # Returning tuple with batch_stats only when train is True
         )
 
 
diff --git a/src/transformers/models/resnet/convert_resnet_to_pytorch.py b/src/transformers/models/resnet/convert_resnet_to_pytorch.py
index 4909f1dc67..7552dbaefb 100644
--- a/src/transformers/models/resnet/convert_resnet_to_pytorch.py
+++ b/src/transformers/models/resnet/convert_resnet_to_pytorch.py
@@ -87,7 +87,7 @@ class ModuleTransfer:
         for dest_m, src_m in zip(dest_traced, src_traced):
             dest_m.load_state_dict(src_m.state_dict())
             if self.verbose == 1:
-                print(f"Transfered from={src_m} to={dest_m}")
+                print(f"Transferred from={src_m} to={dest_m}")
 
 
 def convert_weight_and_push(name: str, config: ResNetConfig, save_directory: Path, push_to_hub: bool = True):
diff --git a/src/transformers/models/resnet/modeling_flax_resnet.py b/src/transformers/models/resnet/modeling_flax_resnet.py
index d7fc2cc1be..aa6c84c0fd 100644
--- a/src/transformers/models/resnet/modeling_flax_resnet.py
+++ b/src/transformers/models/resnet/modeling_flax_resnet.py
@@ -514,7 +514,7 @@ class FlaxResNetPreTrainedModel(FlaxPreTrainedModel):
             output_hidden_states,
             return_dict,
             rngs=rngs,
-            mutable=["batch_stats"] if train else False,  # Returing tuple with batch_stats only when train is True
+            mutable=["batch_stats"] if train else False,  # Returning tuple with batch_stats only when train is True
         )
 
 
diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py
index f0f38a48e5..7f046a340f 100644
--- a/src/transformers/models/roc_bert/tokenization_roc_bert.py
+++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py
@@ -1075,7 +1075,7 @@ class RoCBertWordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/roformer/tokenization_roformer.py b/src/transformers/models/roformer/tokenization_roformer.py
index 312cecca19..6e9748c70b 100644
--- a/src/transformers/models/roformer/tokenization_roformer.py
+++ b/src/transformers/models/roformer/tokenization_roformer.py
@@ -226,7 +226,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py
index 6ecf72d0a1..f7e2071a34 100644
--- a/src/transformers/models/rt_detr/modeling_rt_detr.py
+++ b/src/transformers/models/rt_detr/modeling_rt_detr.py
@@ -1039,7 +1039,7 @@ class RTDetrPreTrainedModel(PreTrainedModel):
     _no_split_modules = [r"RTDetrHybridEncoder", r"RTDetrDecoderLayer"]
 
     def _init_weights(self, module):
-        """Initalize the weights"""
+        """Initialize the weights"""
         if isinstance(module, (RTDetrForObjectDetection, RTDetrDecoder)):
             if module.class_embed is not None:
                 for layer in module.class_embed:
diff --git a/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py b/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py
index 9505c3fdd3..1ab37c0e94 100644
--- a/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py
+++ b/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py
@@ -1313,7 +1313,7 @@ class RTDetrV2PreTrainedModel(PreTrainedModel):
     _no_split_modules = [r"RTDetrV2HybridEncoder", r"RTDetrV2DecoderLayer"]
 
     def _init_weights(self, module):
-        """Initalize the weights"""
+        """Initialize the weights"""
         if isinstance(module, (RTDetrV2ForObjectDetection, RTDetrV2Decoder)):
             if module.class_embed is not None:
                 for layer in module.class_embed:
diff --git a/src/transformers/models/rwkv/configuration_rwkv.py b/src/transformers/models/rwkv/configuration_rwkv.py
index 90c5cf7e1c..be90607c55 100644
--- a/src/transformers/models/rwkv/configuration_rwkv.py
+++ b/src/transformers/models/rwkv/configuration_rwkv.py
@@ -57,7 +57,7 @@ class RwkvConfig(PretrainedConfig):
             The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer as
             GPTNeoX.
         rescale_every (`int`, *optional*, defaults to 6):
-            At inference, the hidden states (and weights of the correponding output layers) are divided by 2 every
+            At inference, the hidden states (and weights of the corresponding output layers) are divided by 2 every
             `rescale_every` layer. If set to 0 or a negative number, no rescale is done.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether or not to tie the word embeddings with the input token embeddings.
diff --git a/src/transformers/models/sam/image_processing_sam.py b/src/transformers/models/sam/image_processing_sam.py
index 5d98674f73..9288cc1485 100644
--- a/src/transformers/models/sam/image_processing_sam.py
+++ b/src/transformers/models/sam/image_processing_sam.py
@@ -827,7 +827,7 @@ class SamImageProcessor(BaseImageProcessor):
             iou_scores (`Union[torch.Tensor, tf.Tensor]`):
                 List of IoU scores.
             original_size (`Tuple[int,int]`):
-                Size of the orginal image.
+                Size of the original image.
             cropped_box_image (`np.array`):
                 The cropped image.
             pred_iou_thresh (`float`, *optional*, defaults to 0.88):
@@ -887,7 +887,7 @@ class SamImageProcessor(BaseImageProcessor):
             iou_scores (`torch.Tensor`):
                 List of IoU scores.
             original_size (`Tuple[int,int]`):
-                Size of the orginal image.
+                Size of the original image.
             cropped_box_image (`np.array`):
                 The cropped image.
             pred_iou_thresh (`float`, *optional*, defaults to 0.88):
@@ -939,7 +939,7 @@ class SamImageProcessor(BaseImageProcessor):
         converted_boxes = converted_boxes[keep_mask]
 
         masks = _pad_masks(masks, cropped_box_image, original_height, original_width)
-        # conversion to rle is necessary to run non-maximum suppresion
+        # conversion to rle is necessary to run non-maximum suppression
         masks = _mask_to_rle_pytorch(masks)
 
         return masks, scores, converted_boxes
@@ -967,7 +967,7 @@ class SamImageProcessor(BaseImageProcessor):
             iou_scores (`tf.Tensor`):
                 List of IoU scores.
             original_size (`Tuple[int,int]`):
-                Size of the orginal image.
+                Size of the original image.
             cropped_box_image (`np.array`):
                 The cropped image.
             pred_iou_thresh (`float`, *optional*, defaults to 0.88):
@@ -1016,7 +1016,7 @@ class SamImageProcessor(BaseImageProcessor):
         converted_boxes = converted_boxes[keep_mask]
 
         masks = _pad_masks_tf(masks, cropped_box_image, original_height, original_width)
-        # conversion to rle is necessary to run non-maximum suppresion
+        # conversion to rle is necessary to run non-maximum suppression
         masks = _mask_to_rle_tf(masks)
 
         return masks, scores, converted_boxes
@@ -1024,7 +1024,7 @@ class SamImageProcessor(BaseImageProcessor):
 
 def _compute_stability_score_pt(masks: "torch.Tensor", mask_threshold: float, stability_score_offset: int):
     # One mask is always contained inside the other.
-    # Save memory by preventing unnecesary cast to torch.int64
+    # Save memory by preventing unnecessary cast to torch.int64
     intersections = (
         (masks > (mask_threshold + stability_score_offset)).sum(-1, dtype=torch.int16).sum(-1, dtype=torch.int32)
     )
diff --git a/src/transformers/models/sam/modeling_sam.py b/src/transformers/models/sam/modeling_sam.py
index 549eb8a317..c81b351c8c 100644
--- a/src/transformers/models/sam/modeling_sam.py
+++ b/src/transformers/models/sam/modeling_sam.py
@@ -690,8 +690,8 @@ class SamPromptEncoder(nn.Module):
         # torch.where and expanding the labels tensor is required by the ONNX export
         point_embedding = torch.where(labels[..., None] == -1, self.not_a_point_embed.weight, point_embedding)
 
-        # This is required for the ONNX export. The dtype, device need to be explicitely
-        # specificed as otherwise torch.onnx.export interprets as double
+        # This is required for the ONNX export. The dtype, device need to be explicitly
+        # specified as otherwise torch.onnx.export interprets as double
         point_embedding = torch.where(
             labels[..., None] != -10,
             point_embedding,
@@ -1250,7 +1250,7 @@ SAM_INPUTS_DOCSTRING = r"""
             Input boxes for the points, this is used by the prompt encoder to encode the prompt. Generally yields to
             much better generated masks. The boxes can be obtained by passing a list of list of list to the processor,
             that will generate a `torch` tensor, with each dimension corresponding respectively to the image batch
-            size, the number of boxes per image and the coordinates of the top left and botton right point of the box.
+            size, the number of boxes per image and the coordinates of the top left and bottom right point of the box.
             In the order (`x1`, `y1`, `x2`, `y2`):
 
             - `x1`: the x coordinate of the top left point of the input box
diff --git a/src/transformers/models/sam/modeling_tf_sam.py b/src/transformers/models/sam/modeling_tf_sam.py
index cbe06e0542..fcfb7c2184 100644
--- a/src/transformers/models/sam/modeling_tf_sam.py
+++ b/src/transformers/models/sam/modeling_tf_sam.py
@@ -1374,7 +1374,7 @@ SAM_INPUTS_DOCSTRING = r"""
             Input boxes for the points, this is used by the prompt encoder to encode the prompt. Generally yields to
             much better generated masks. The boxes can be obtained by passing a list of list of list to the processor,
             that will generate a `tf` tensor, with each dimension corresponding respectively to the image batch size,
-            the number of boxes per image and the coordinates of the top left and botton right point of the box. In the
+            the number of boxes per image and the coordinates of the top left and bottom right point of the box. In the
             order (`x1`, `y1`, `x2`, `y2`):
 
             - `x1`: the x coordinate of the top left point of the input box
diff --git a/src/transformers/models/sam/processing_sam.py b/src/transformers/models/sam/processing_sam.py
index 0b30261fa9..ff6a4cd763 100644
--- a/src/transformers/models/sam/processing_sam.py
+++ b/src/transformers/models/sam/processing_sam.py
@@ -113,7 +113,7 @@ class SamProcessor(ProcessorMixin):
             **output_kwargs["images_kwargs"],
         )
 
-        # pop arguments that are not used in the foward but used nevertheless
+        # pop arguments that are not used in the forward but used nevertheless
         original_sizes = encoding_image_processor["original_sizes"]
 
         if hasattr(original_sizes, "numpy"):  # Checks if Torch or TF tensor
diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
index fa7c75844f..8e179463d1 100755
--- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
+++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
@@ -100,7 +100,7 @@ SEAMLESS_M4T_INPUTS_DOCSTRING_FIRST_PART = r"""
 
             [What are input IDs?](../glossary#input-ids)
         input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
-            Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
+            Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
             [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
     """
 
@@ -118,7 +118,7 @@ SEAMLESS_M4T_INPUTS_DOCSTRING_TEXT_PART = r"""
 SEAMLESS_M4T_INPUTS_DOCSTRING_SPEECH_PART = r"""
     Args:
         input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
-            Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
+            Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
             [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
         """
 
@@ -960,7 +960,7 @@ class SeamlessM4TConformerAdapterLayer(nn.Module):
 
         # The rest of the computation is identical to a vanilla Transformer
         # encoder layer.
-        hidden_states, attn_weigths = self.self_attn(
+        hidden_states, attn_weights = self.self_attn(
             hidden_states,
             attention_mask=attention_mask,
             output_attentions=output_attentions,
@@ -3084,7 +3084,7 @@ class SeamlessM4TForSpeechToText(SeamlessM4TPreTrainedModel, GenerationMixin):
 
         Parameters:
             input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
-                Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
+                Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
                 [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
 
             tgt_lang (`str`, *optional*):
@@ -3694,7 +3694,7 @@ class SeamlessM4TForSpeechToSpeech(SeamlessM4TPreTrainedModel, GenerationMixin):
 
         Args:
             input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
-                Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
+                Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
                 [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
             return_intermediate_token_ids (`bool`, *optional*):
                 If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
@@ -4103,7 +4103,7 @@ class SeamlessM4TModel(SeamlessM4TPreTrainedModel, GenerationMixin):
 
                 [What are input IDs?](../glossary#input-ids)
             input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`, *optional*):
-                Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
+                Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
                 [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
             return_intermediate_token_ids (`bool`, *optional*):
                 If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
index 998008bbf5..9f7482b8a2 100644
--- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
+++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
@@ -906,7 +906,7 @@ class SeamlessM4Tv2ConformerAdapterLayer(nn.Module):
 
         # The rest of the computation is identical to a vanilla Transformer
         # encoder layer.
-        hidden_states, attn_weigths = self.self_attn(
+        hidden_states, attn_weights = self.self_attn(
             hidden_states,
             attention_mask=attention_mask,
             output_attentions=output_attentions,
@@ -3374,7 +3374,7 @@ class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin
 
         Parameters:
             input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
-                Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
+                Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
                 [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
 
             tgt_lang (`str`, *optional*):
diff --git a/src/transformers/models/sew/configuration_sew.py b/src/transformers/models/sew/configuration_sew.py
index bab5073be5..aff4e37542 100644
--- a/src/transformers/models/sew/configuration_sew.py
+++ b/src/transformers/models/sew/configuration_sew.py
@@ -99,7 +99,7 @@ class SEWConfig(PretrainedConfig):
             Recognition](https://arxiv.org/abs/1904.08779).
         mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
-            procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+            procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
             masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
             actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
@@ -111,7 +111,7 @@ class SEWConfig(PretrainedConfig):
             mask_time_min_masks''
         mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
-            masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
+            masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
             the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
             span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
             may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
diff --git a/src/transformers/models/sew_d/configuration_sew_d.py b/src/transformers/models/sew_d/configuration_sew_d.py
index 515b6adc37..b965c85662 100644
--- a/src/transformers/models/sew_d/configuration_sew_d.py
+++ b/src/transformers/models/sew_d/configuration_sew_d.py
@@ -112,7 +112,7 @@ class SEWDConfig(PretrainedConfig):
             Recognition](https://arxiv.org/abs/1904.08779).
         mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
-            procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+            procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
             masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
             actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
@@ -124,7 +124,7 @@ class SEWDConfig(PretrainedConfig):
             mask_time_min_masks''
         mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
-            masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
+            masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
             the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
             span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
             may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
diff --git a/src/transformers/models/shieldgemma2/processing_shieldgemma2.py b/src/transformers/models/shieldgemma2/processing_shieldgemma2.py
index 097b83ef6e..87fbcb66c1 100644
--- a/src/transformers/models/shieldgemma2/processing_shieldgemma2.py
+++ b/src/transformers/models/shieldgemma2/processing_shieldgemma2.py
@@ -105,7 +105,7 @@ class ShieldGemma2Processor(Gemma3Processor):
             text: Not supported.
             videos: Not supported.
             audio: Not supported.
-            kwargs: An optional dictionary of keyword arguments to configre the
+            kwargs: An optional dictionary of keyword arguments to configure the
                 processor. Possible values include:
 
                 *   `custom_policies`: Additional policy definitions that augment the `self.policy_definitions` passed
@@ -116,7 +116,7 @@ class ShieldGemma2Processor(Gemma3Processor):
                     generated for every key in the joint dictionary.
 
         Returns:
-            A `BatchFeature` continaing `input_ids`, `pixel_values`, etc. where each Tensor is of shape
+            A `BatchFeature` containing `input_ids`, `pixel_values`, etc. where each Tensor is of shape
             `(len(images) * len(policies), )`, and the order within the batch will be
             img1_policy1, ... img1_policyN, ... imgM_policyN.
         """
diff --git a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py
index 14dfab7eaa..63e161fcbe 100644
--- a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py
@@ -78,7 +78,7 @@ class SpeechEncoderDecoderConfig(PretrainedConfig):
         super().__init__(**kwargs)
         if "encoder" not in kwargs or "decoder" not in kwargs:
             raise ValueError(
-                f"A configuraton of type {self.model_type} cannot be instantiated because not both `encoder` and"
+                f"A configuration of type {self.model_type} cannot be instantiated because not both `encoder` and"
                 f" `decoder` sub-configurations are passed, but only {kwargs}"
             )
 
diff --git a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py
index fd837146d5..772b1d23bd 100644
--- a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py
@@ -807,7 +807,7 @@ class FlaxSpeechEncoderDecoderModel(FlaxPreTrainedModel):
                       [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
 
             model_args (remaining positional arguments, *optional*):
-                All remaning positional arguments will be passed to the underlying model's `__init__` method.
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
 
             kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
index 425c3f7d5b..9cab18104e 100644
--- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
@@ -317,7 +317,7 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin):
                       PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
             model_args (remaining positional arguments, *optional*):
-                All remaning positional arguments will be passed to the underlying model's `__init__` method.
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
 
             kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
index fd52e64a10..b1a4d2065d 100755
--- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
@@ -547,7 +547,7 @@ class Speech2TextPreTrainedModel(PreTrainedModel):
 
     def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
         # generate creates 3D attention mask, because of the shape of input_features
-        # convert it to 2D if thats the case
+        # convert it to 2D if that's the case
         if len(attention_mask.shape) > 2:
             attention_mask = attention_mask[:, :, -1]
 
diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
index 7a20e46227..858079f5b6 100755
--- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
@@ -815,7 +815,7 @@ class TFSpeech2TextEncoder(keras.layers.Layer):
 
     def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
         # generate creates 3D attention mask, because of the shape of input_features
-        # convert it to 2D if thats the case
+        # convert it to 2D if that's the case
         if len(attention_mask.shape) > 2:
             attention_mask = attention_mask[:, :, -1]
 
diff --git a/src/transformers/models/speecht5/configuration_speecht5.py b/src/transformers/models/speecht5/configuration_speecht5.py
index 2025805983..7af0dc222e 100644
--- a/src/transformers/models/speecht5/configuration_speecht5.py
+++ b/src/transformers/models/speecht5/configuration_speecht5.py
@@ -108,7 +108,7 @@ class SpeechT5Config(PretrainedConfig):
             Recognition](https://arxiv.org/abs/1904.08779).
         mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
-            procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+            procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
             masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
             actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
@@ -120,7 +120,7 @@ class SpeechT5Config(PretrainedConfig):
             mask_time_min_masks''
         mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
-            masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
+            masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
             the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
             span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
             may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
diff --git a/src/transformers/models/splinter/tokenization_splinter.py b/src/transformers/models/splinter/tokenization_splinter.py
index 76b7543eb2..67243d4ee4 100644
--- a/src/transformers/models/splinter/tokenization_splinter.py
+++ b/src/transformers/models/splinter/tokenization_splinter.py
@@ -460,7 +460,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
           text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert.py b/src/transformers/models/squeezebert/tokenization_squeezebert.py
index 71194f3d8d..e3a73917f3 100644
--- a/src/transformers/models/squeezebert/tokenization_squeezebert.py
+++ b/src/transformers/models/squeezebert/tokenization_squeezebert.py
@@ -464,7 +464,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py
index 25246f3382..3c25697c39 100644
--- a/src/transformers/models/table_transformer/modeling_table_transformer.py
+++ b/src/transformers/models/table_transformer/modeling_table_transformer.py
@@ -141,7 +141,7 @@ class TableTransformerObjectDetectionOutput(ModelOutput):
             possible padding). You can use [`~TableTransformerImageProcessor.post_process_object_detection`] to retrieve the
             unnormalized bounding boxes.
         auxiliary_outputs (`list[Dict]`, *optional*):
-            Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
             and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
             `pred_boxes`) for each decoder layer.
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
@@ -1223,8 +1223,8 @@ class TableTransformerModel(TableTransformerPreTrainedModel):
         flattened_mask = mask.flatten(1)
 
         # Fourth, sent flattened_features + flattened_mask + object queries through encoder
-        # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size)
-        # flattened_mask is a Tensor of shape (batch_size, heigth*width)
+        # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
+        # flattened_mask is a Tensor of shape (batch_size, height*width)
         if encoder_outputs is None:
             encoder_outputs = self.encoder(
                 inputs_embeds=flattened_features,
diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py
index b290b4990d..c6a9fac40c 100644
--- a/src/transformers/models/tapas/tokenization_tapas.py
+++ b/src/transformers/models/tapas/tokenization_tapas.py
@@ -2162,7 +2162,7 @@ class WordpieceTokenizer:
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
             text: A single token or whitespace separated tokens. This should have
diff --git a/src/transformers/models/tvp/modeling_tvp.py b/src/transformers/models/tvp/modeling_tvp.py
index 9e93e6e900..91a0fc10b1 100644
--- a/src/transformers/models/tvp/modeling_tvp.py
+++ b/src/transformers/models/tvp/modeling_tvp.py
@@ -251,7 +251,7 @@ class TvpVisualInputEmbedding(nn.Module):
         # (batch_size, height, width, hidden_dim)
         positional_embeddings = row_position_embeddings + col_position_embeddings
 
-        # This interpolation gets triggered ONLY when the input image dim is larger in any dimenstion than the original position embeddings
+        # This interpolation gets triggered ONLY when the input image dim is larger in any dimension than the original position embeddings
         if interpolate_pos_encoding and (
             height > self.max_grid_row_position_embeddings or width > self.max_grid_col_position_embeddings
         ):
diff --git a/src/transformers/models/unispeech/configuration_unispeech.py b/src/transformers/models/unispeech/configuration_unispeech.py
index 549767c0fb..6a5d2ea401 100644
--- a/src/transformers/models/unispeech/configuration_unispeech.py
+++ b/src/transformers/models/unispeech/configuration_unispeech.py
@@ -105,7 +105,7 @@ class UniSpeechConfig(PretrainedConfig):
             Recognition](https://arxiv.org/abs/1904.08779).
         mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
-            procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+            procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
             masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
             actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
@@ -117,7 +117,7 @@ class UniSpeechConfig(PretrainedConfig):
             mask_time_min_masks''
         mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
-            masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
+            masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
             the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
             span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
             may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
diff --git a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py
index 6e736960bc..de81639d62 100644
--- a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py
@@ -106,7 +106,7 @@ class UniSpeechSatConfig(PretrainedConfig):
             Recognition](https://arxiv.org/abs/1904.08779).
         mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
-            procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+            procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
             masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
             actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
@@ -118,7 +118,7 @@ class UniSpeechSatConfig(PretrainedConfig):
             mask_time_min_masks''
         mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
-            masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
+            masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
             the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
             span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
             may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py
index a49bb1b225..50b0304eb5 100644
--- a/src/transformers/models/video_llava/processing_video_llava.py
+++ b/src/transformers/models/video_llava/processing_video_llava.py
@@ -46,7 +46,7 @@ class VideoLlavaProcessor(ProcessorMixin):
             Patch size from the vision tower.
         vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
             The feature selection strategy used to select the vision feature from the vision backbone.
-            Shoudl be same as in model's config
+            Should be same as in model's config
         image_token (`str`, *optional*, defaults to `"<image>"`):
             Special token used to denote image location.
         video_token (`str`, *optional*, defaults to `"<video>"`):
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
index 5bb927f2e1..659afb976c 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
@@ -749,7 +749,7 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
                       [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
 
             model_args (remaining positional arguments, *optional*):
-                All remaning positional arguments will be passed to the underlying model's `__init__` method.
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
 
             kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
index b909d13665..29d855728a 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
@@ -340,7 +340,7 @@ class TFVisionEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLos
                       `decoder_from_pt` should be set to `True`.
 
             model_args (remaining positional arguments, *optional*):
-                All remaning positional arguments will be passed to the underlying model's `__init__` method.
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
 
             kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
index 7451973b5b..69f95ff056 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
@@ -410,7 +410,7 @@ class VisionEncoderDecoderModel(PreTrainedModel, GenerationMixin):
                       PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
             model_args (remaining positional arguments, *optional*):
-                All remaning positional arguments will be passed to the underlying model's `__init__` method.
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
 
             kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
index b12327d8ca..57afc18988 100644
--- a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
@@ -41,7 +41,7 @@ VISION_TEXT_DUAL_ENCODER_START_DOCSTRING = r"""
     should be fine-tuned on a downstream task, like contrastive image-text modeling.
 
     In [LiT: Zero-Shot Transfer with Locked-image Text Tuning](https://arxiv.org/abs/2111.07991) it is shown how
-    leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant improvment
+    leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant improvement
     on new zero-shot vision tasks such as image classification or retrieval.
 
     After such a Vision-Text-Dual-Encoder model has been trained/fine-tuned, it can be saved/loaded just like any other
@@ -444,7 +444,7 @@ class FlaxVisionTextDualEncoderModel(FlaxPreTrainedModel):
                       conversion scripts and loading the Flax model afterwards.
 
             model_args (remaining positional arguments, *optional*):
-                All remaning positional arguments will be passed to the underlying model's `__init__` method.
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
 
             kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
index ca88d2fec9..6ce382a74b 100644
--- a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
@@ -48,7 +48,7 @@ VISION_TEXT_DUAL_ENCODER_START_DOCSTRING = r"""
     should be fine-tuned on a downstream task, like contrastive image-text modeling.
 
     In [LiT: Zero-Shot Transfer with Locked-image Text Tuning](https://arxiv.org/abs/2111.07991) it is shown how
-    leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant improvment
+    leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant improvement
     on new zero-shot vision tasks such as image classification or retrieval.
 
     After such a Vision-Text-Dual-Encoder model has been trained/fine-tuned, it can be saved/loaded just like any other
@@ -491,7 +491,7 @@ class TFVisionTextDualEncoderModel(TFPreTrainedModel):
                       should be set to `True` and a configuration object should be provided as `config` argument.
 
             model_args (remaining positional arguments, *optional*):
-                All remaning positional arguments will be passed to the underlying model's `__init__` method.
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
 
             kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
index 3e770f6935..7b2e262e87 100755
--- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
@@ -38,7 +38,7 @@ VISION_TEXT_DUAL_ENCODER_START_DOCSTRING = r"""
     should be fine-tuned on a downstream task, like contrastive image-text modeling.
 
     In [LiT: Zero-Shot Transfer with Locked-image Text Tuning](https://arxiv.org/abs/2111.07991) it is shown how
-    leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant improvment
+    leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant improvement
     on new zero-shot vision tasks such as image classification or retrieval.
 
     After such a Vision-Text-Dual-Encoder model has been trained/fine-tuned, it can be saved/loaded just like any other
@@ -440,7 +440,7 @@ class VisionTextDualEncoderModel(PreTrainedModel):
                       conversion scripts and loading the Flax model afterwards.
 
             model_args (remaining positional arguments, *optional*):
-                All remaning positional arguments will be passed to the underlying model's `__init__` method.
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
 
             kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
diff --git a/src/transformers/models/vit_mae/modeling_tf_vit_mae.py b/src/transformers/models/vit_mae/modeling_tf_vit_mae.py
index 8879a8665f..822f0c4041 100644
--- a/src/transformers/models/vit_mae/modeling_tf_vit_mae.py
+++ b/src/transformers/models/vit_mae/modeling_tf_vit_mae.py
@@ -1055,7 +1055,7 @@ class TFViTMAEDecoder(keras.layers.Layer):
 
     def interpolate_pos_encoding(self, embeddings) -> tf.Tensor:
         """
-        This method is a modified version of the interpolation function for ViT-mae model at the deocder, that
+        This method is a modified version of the interpolation function for ViT-mae model at the decoder, that
         allows to interpolate the pre-trained decoder position encodings, to be able to use the model on higher
         resolution images.
 
diff --git a/src/transformers/models/wav2vec2/configuration_wav2vec2.py b/src/transformers/models/wav2vec2/configuration_wav2vec2.py
index 3b717f6af9..b1702ae22e 100644
--- a/src/transformers/models/wav2vec2/configuration_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/configuration_wav2vec2.py
@@ -105,7 +105,7 @@ class Wav2Vec2Config(PretrainedConfig):
             Recognition](https://arxiv.org/abs/1904.08779).
         mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
-            procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+            procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
             masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
             actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
@@ -117,7 +117,7 @@ class Wav2Vec2Config(PretrainedConfig):
             mask_time_min_masks''
         mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
-            masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
+            masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
             the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
             span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
             may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
index 4904030606..46c5dd790a 100755
--- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -1993,7 +1993,7 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
         extract_features = self.dropout_features(outputs[1])
 
         if attention_mask is not None:
-            # compute reduced attention_mask correponding to feature vectors
+            # compute reduced attention_mask corresponding to feature vectors
             attention_mask = self._get_feature_vector_attention_mask(
                 extract_features.shape[1], attention_mask, add_adapter=False
             )
diff --git a/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py
index cff7c5cce3..c4af932213 100644
--- a/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py
+++ b/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py
@@ -75,7 +75,7 @@ class Wav2Vec2BertConfig(PretrainedConfig):
             Recognition](https://arxiv.org/abs/1904.08779).
         mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
-            procecure generates `mask_time_prob*len(time_axis)/mask_time_length ``independent masks over the axis. If
+            procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
             reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
             masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
             actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
@@ -87,7 +87,7 @@ class Wav2Vec2BertConfig(PretrainedConfig):
             mask_time_min_masks`.
         mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
-            masking procecure generates `mask_feature_prob*len(feature_axis)/mask_time_length` independent masks over
+            masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks over
             the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
             span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
             may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
diff --git a/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py b/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py
index 33510654dc..780dedd8ac 100644
--- a/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py
+++ b/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py
@@ -104,7 +104,7 @@ def _convert_model(
             state_dict[new_key] = state_dict.pop(k)
 
     extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys())
-    extra_keys = set({k for k in extra_keys if "num_updates" not in k})  # filter unecessary param
+    extra_keys = set({k for k in extra_keys if "num_updates" not in k})  # filter unnecessary param
     missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys())
     if len(extra_keys) != 0:
         raise ValueError(f"extra keys found: {extra_keys}")
diff --git a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py
index d3d393f9ba..27c809ac5a 100644
--- a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py
+++ b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py
@@ -707,7 +707,7 @@ class Wav2Vec2BertAdapterLayer(nn.Module):
 
         # The rest of the computation is identical to a vanilla Transformer
         # encoder layer.
-        hidden_states, attn_weigths = self.self_attn(
+        hidden_states, attn_weights = self.self_attn(
             hidden_states,
             attention_mask=attention_mask,
             output_attentions=output_attentions,
diff --git a/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py
index cc1df08aa2..8447bde849 100644
--- a/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py
+++ b/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py
@@ -582,7 +582,7 @@ class Wav2Vec2BertAdapterLayer(nn.Module):
 
         # The rest of the computation is identical to a vanilla Transformer
         # encoder layer.
-        hidden_states, attn_weigths = self.self_attn(
+        hidden_states, attn_weights = self.self_attn(
             hidden_states,
             attention_mask=attention_mask,
             output_attentions=output_attentions,
diff --git a/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py
index d324614e76..d33adb90d6 100644
--- a/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py
+++ b/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py
@@ -102,7 +102,7 @@ class Wav2Vec2ConformerConfig(PretrainedConfig):
             Recognition](https://arxiv.org/abs/1904.08779).
         mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
-            procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+            procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
             masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
             actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
@@ -114,7 +114,7 @@ class Wav2Vec2ConformerConfig(PretrainedConfig):
             mask_time_min_masks''
         mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
-            masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
+            masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
             the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
             span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
             may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
index bd94e44b61..e5a0f2d87b 100644
--- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
+++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
@@ -1437,7 +1437,7 @@ class Wav2Vec2ConformerForPreTraining(Wav2Vec2ConformerPreTrainedModel):
         extract_features = self.dropout_features(outputs[1])
 
         if attention_mask is not None:
-            # compute reduced attention_mask correponding to feature vectors
+            # compute reduced attention_mask corresponding to feature vectors
             attention_mask = self._get_feature_vector_attention_mask(
                 extract_features.shape[1], attention_mask, add_adapter=False
             )
diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
index 574de200fa..9f60bf346a 100644
--- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
+++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
@@ -69,7 +69,7 @@ class Wav2Vec2PhonemeCTCTokenizerOutput(ModelOutput):
             Decoded logits in text from. Usually the speech transcription.
         char_offsets (list of `List[Dict[str, Union[int, str]]]` or `List[Dict[str, Union[int, str]]]`):
             Offsets of the decoded characters. In combination with sampling rate and model downsampling rate char
-            offsets can be used to compute time stamps for each charater. Total logit score of the beam associated with
+            offsets can be used to compute time stamps for each character. Total logit score of the beam associated with
             produced text.
     """
 
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 7706819ab4..8ad742558e 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -131,7 +131,7 @@ class WhisperConfig(PretrainedConfig):
             function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI each correspond to the `english-only` and the
             `multilingual` model.
         begin_suppress_tokens (`List[int]`, *optional*, defaults to `[220,50256]`):
-            A list containing tokens that will be supressed at the beginning of the sampling process. Initialized as
+            A list containing tokens that will be suppressed at the beginning of the sampling process. Initialized as
             the token for `" "` (`blank_token_id`) and the `eos_token_id`
         use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
             Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
@@ -145,7 +145,7 @@ class WhisperConfig(PretrainedConfig):
             Recognition](https://arxiv.org/abs/1904.08779).
         mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
-            procecure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
+            procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
             reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
             masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
             actual percentage of masked vectors. This is only relevant if `apply_spec_augment == True`.
@@ -157,7 +157,7 @@ class WhisperConfig(PretrainedConfig):
             mask_time_min_masks''
         mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
-            masking procecure generates `mask_feature_prob*len(feature_axis)/mask_time_length` independent masks over
+            masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks over
             the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
             span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
             may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index af21fb7d3e..575bfb1c54 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -49,9 +49,9 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
         sampling_rate (`int`, *optional*, defaults to 16000):
             The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
         hop_length (`int`, *optional*, defaults to 160):
-            Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
+            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
         chunk_length (`int`, *optional*, defaults to 30):
-            The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio
+            The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
             sequences.
         n_fft (`int`, *optional*, defaults to 400):
             Size of the Fourier transform.
diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py
index 50e3db09a4..da1d83b2a8 100644
--- a/src/transformers/models/whisper/generation_whisper.py
+++ b/src/transformers/models/whisper/generation_whisper.py
@@ -253,7 +253,7 @@ class WhisperGenerationMixin(GenerationMixin):
             # beam search takes `decoder_input_ids` into account in the `beam_indices` length
             # but forgot to shift the beam_indices by the number of `decoder_input_ids`
             beam_indices = torch.zeros_like(generate_outputs.beam_indices[:, :weight_length])
-            # we actually shif the beam indices here
+            # we actually shift the beam indices here
             beam_indices[:, num_input_ids:] = generate_outputs.beam_indices[:, : weight_length - num_input_ids]
 
             weights = weights[:, :, :weight_length]
@@ -294,7 +294,7 @@ class WhisperGenerationMixin(GenerationMixin):
                 weights = weights[..., : num_frames[0] // 2]
 
             else:
-                # num_frames is of shape (batch_size,) whereas batch_size is truely batch_size*num_return_sequences
+                # num_frames is of shape (batch_size,) whereas batch_size is truly batch_size*num_return_sequences
                 repeat_time = batch_size if isinstance(num_frames, int) else batch_size // len(num_frames)
                 num_frames = num_frames.cpu() if isinstance(num_frames, (torch.Tensor)) else num_frames
                 num_frames = np.repeat(num_frames, repeat_time)
@@ -865,7 +865,7 @@ class WhisperGenerationMixin(GenerationMixin):
 
         if return_dict_in_generate and generation_config.return_dict_in_generate:
             logger.warning_once(
-                "You have passed `return_dict_in_generate=True` and `return_timestamps=True`, this automatically sets `return_segments=True` to access the resuls of the underlying calls to GenerationMixin's generate in the returned `segments`."
+                "You have passed `return_dict_in_generate=True` and `return_timestamps=True`, this automatically sets `return_segments=True` to access the results of the underlying calls to GenerationMixin's generate in the returned `segments`."
             )
             return_segments = True
         elif not return_segments and not return_token_timestamps:
@@ -1466,7 +1466,7 @@ class WhisperGenerationMixin(GenerationMixin):
         if language is not None:
             lang_ids = [language_to_id(l) for l in languages]
         elif hasattr(generation_config, "lang_to_id") and is_lang_id_undefined:
-            # language is not defined or intentially set to `None` to trigger language detection
+            # language is not defined or intentionally set to `None` to trigger language detection
             lang_ids = self.detect_language(
                 input_features=input_features,
                 encoder_outputs=kwargs.get("encoder_outputs", None),
@@ -1554,7 +1554,7 @@ class WhisperGenerationMixin(GenerationMixin):
         if input_features is None and encoder_outputs is None:
             raise ValueError("You have to specify either `input_features` or `encoder_outputs`")
         elif input_features is not None and encoder_outputs is not None:
-            raise ValueError("Make sure to specificy only one of `input_features` or `encoder_outputs` - not both!")
+            raise ValueError("Make sure to specify only one of `input_features` or `encoder_outputs` - not both!")
         elif input_features is not None:
             inputs = {"input_features": input_features[:, :, :num_segment_frames]}
             batch_size = input_features.shape[0]
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 9e4c89b591..b713566bfc 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -515,7 +515,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
 
     def normalize(self, text):
         """
-        Normalize a given string using the `EnglishTextNormalizer` class, which preforms commons transformation on
+        Normalize a given string using the `EnglishTextNormalizer` class, which performs commons transformation on
         english text.
         """
         normalizer = EnglishTextNormalizer(self.english_spelling_normalizer)
@@ -524,7 +524,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
     @staticmethod
     def basic_normalize(text, remove_diacritics=False):
         """
-        Normalize a given string using the `BasicTextNormalizer` class, which preforms commons transformation on
+        Normalize a given string using the `BasicTextNormalizer` class, which performs commons transformation on
         multilingual text.
         """
         normalizer = BasicTextNormalizer(remove_diacritics=remove_diacritics)
@@ -1045,7 +1045,7 @@ def _decode_asr(tokenizer, model_outputs, *, return_timestamps, return_language,
                         # as a stop where it should be a start.
                         # This is an issue in the underlying model output
                         # Let's just skip it so it becomes de-factor
-                        # a start agin
+                        # a start again
                         pass
                     else:
                         chunk["timestamp"][1] = time
diff --git a/src/transformers/models/whisper/tokenization_whisper_fast.py b/src/transformers/models/whisper/tokenization_whisper_fast.py
index 64dcc186bf..59abecaa26 100644
--- a/src/transformers/models/whisper/tokenization_whisper_fast.py
+++ b/src/transformers/models/whisper/tokenization_whisper_fast.py
@@ -420,7 +420,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
     # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.normalize
     def normalize(self, text):
         """
-        Normalize a given string using the `EnglishTextNormalizer` class, which preforms commons transformation on
+        Normalize a given string using the `EnglishTextNormalizer` class, which performs commons transformation on
         english text.
         """
         normalizer = EnglishTextNormalizer(self.english_spelling_normalizer)
@@ -430,7 +430,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
     # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.basic_normalize
     def basic_normalize(text, remove_diacritics=False):
         """
-        Normalize a given string using the `BasicTextNormalizer` class, which preforms commons transformation on
+        Normalize a given string using the `BasicTextNormalizer` class, which performs commons transformation on
         multilingual text.
         """
         normalizer = BasicTextNormalizer(remove_diacritics=remove_diacritics)
diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py
index a500e5dccd..310ca960c3 100644
--- a/src/transformers/models/x_clip/configuration_x_clip.py
+++ b/src/transformers/models/x_clip/configuration_x_clip.py
@@ -252,7 +252,7 @@ class XCLIPConfig(PretrainedConfig):
         prompt_projection_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probability for the projection layers in the video specific prompt generator.
         logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
-            The inital value of the *logit_scale* parameter. Default is used as per the original XCLIP implementation.
+            The initial value of the *logit_scale* parameter. Default is used as per the original XCLIP implementation.
         kwargs (*optional*):
             Dictionary of keyword arguments.
     """
diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py
index 42bebb37f7..2705a412b7 100755
--- a/src/transformers/models/yolos/modeling_yolos.py
+++ b/src/transformers/models/yolos/modeling_yolos.py
@@ -67,7 +67,7 @@ class YolosObjectDetectionOutput(ModelOutput):
             possible padding). You can use [`~YolosImageProcessor.post_process`] to retrieve the unnormalized bounding
             boxes.
         auxiliary_outputs (`list[Dict]`, *optional*):
-            Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
             and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
             `pred_boxes`) for each decoder layer.
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
@@ -155,9 +155,9 @@ class InterpolateInitialPositionEmbeddings(nn.Module):
         patch_pos_embed = patch_pos_embed.view(batch_size, hidden_size, patch_height, patch_width)
 
         height, width = img_size
-        new_patch_heigth, new_patch_width = height // self.config.patch_size, width // self.config.patch_size
+        new_patch_height, new_patch_width = height // self.config.patch_size, width // self.config.patch_size
         patch_pos_embed = nn.functional.interpolate(
-            patch_pos_embed, size=(new_patch_heigth, new_patch_width), mode="bicubic", align_corners=False
+            patch_pos_embed, size=(new_patch_height, new_patch_width), mode="bicubic", align_corners=False
         )
         patch_pos_embed = patch_pos_embed.flatten(2).transpose(1, 2)
         scale_pos_embed = torch.cat((cls_pos_embed, patch_pos_embed, det_pos_embed), dim=1)
diff --git a/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py b/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py
index 950769ae1e..07652ed021 100644
--- a/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py
+++ b/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py
@@ -85,7 +85,7 @@ def convert_yoso_checkpoint(checkpoint_path, yoso_config_file, pytorch_dump_path
     model.eval()
     model.save_pretrained(pytorch_dump_path)
 
-    print(f"Checkpoint successfuly converted. Model saved at {pytorch_dump_path}")
+    print(f"Checkpoint successfully converted. Model saved at {pytorch_dump_path}")
 
 
 if __name__ == "__main__":
diff --git a/src/transformers/models/zamba/modeling_zamba.py b/src/transformers/models/zamba/modeling_zamba.py
index 29c54fbdf4..31949cf6a0 100644
--- a/src/transformers/models/zamba/modeling_zamba.py
+++ b/src/transformers/models/zamba/modeling_zamba.py
@@ -1310,7 +1310,7 @@ class ZambaForCausalLM(ZambaPreTrainedModel, GenerationMixin):
         use_cache=True,
         **kwargs,
     ):
-        # Overwitten -- has a unique cache type, `ZambaHybridDynamicCache`
+        # Overwritten -- has a unique cache type, `ZambaHybridDynamicCache`
 
         empty_past_kv = past_key_values is None
 
diff --git a/src/transformers/models/zamba2/modeling_zamba2.py b/src/transformers/models/zamba2/modeling_zamba2.py
index ad6ca0f872..19fc05b217 100644
--- a/src/transformers/models/zamba2/modeling_zamba2.py
+++ b/src/transformers/models/zamba2/modeling_zamba2.py
@@ -571,7 +571,7 @@ class Zamba2MambaMixer(nn.Module):
             projection_size,
             bias=config.add_bias_linear,
         )
-        # selective projection used to make dt, B and C input dependant
+        # selective projection used to make dt, B and C input dependent
 
         # time step projection (discretization)
         # instantiate once and copy inv_dt in init_weights of PretrainedModel
@@ -1719,7 +1719,7 @@ class Zamba2ForCausalLM(Zamba2PreTrainedModel, GenerationMixin):
         use_cache=True,
         **kwargs,
     ):
-        # Overwitten -- has a unique cache type, `Zamba2HybridDynamicCache`
+        # Overwritten -- has a unique cache type, `Zamba2HybridDynamicCache`
 
         empty_past_kv = past_key_values is None
 
diff --git a/src/transformers/models/zamba2/modular_zamba2.py b/src/transformers/models/zamba2/modular_zamba2.py
index cd0af82658..be4f38044c 100644
--- a/src/transformers/models/zamba2/modular_zamba2.py
+++ b/src/transformers/models/zamba2/modular_zamba2.py
@@ -341,7 +341,7 @@ class Zamba2MambaMixer(nn.Module):
             projection_size,
             bias=config.add_bias_linear,
         )
-        # selective projection used to make dt, B and C input dependant
+        # selective projection used to make dt, B and C input dependent
 
         # time step projection (discretization)
         # instantiate once and copy inv_dt in init_weights of PretrainedModel
diff --git a/src/transformers/models/zoedepth/image_processing_zoedepth.py b/src/transformers/models/zoedepth/image_processing_zoedepth.py
index d40b938403..9d87898235 100644
--- a/src/transformers/models/zoedepth/image_processing_zoedepth.py
+++ b/src/transformers/models/zoedepth/image_processing_zoedepth.py
@@ -107,10 +107,10 @@ class ZoeDepthImageProcessor(BaseImageProcessor):
         do_pad (`bool`, *optional*, defaults to `True`):
             Whether to apply pad the input.
         do_rescale (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the image by the specified scale `rescale_factor`. Can be overidden by `do_rescale` in
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
             `preprocess`.
         rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
-            Scale factor to use if rescaling the image. Can be overidden by `rescale_factor` in `preprocess`.
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in `preprocess`.
         do_normalize (`bool`, *optional*, defaults to `True`):
             Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
             method.
@@ -121,25 +121,25 @@ class ZoeDepthImageProcessor(BaseImageProcessor):
             Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
             number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
         do_resize (`bool`, *optional*, defaults to `True`):
-            Whether to resize the image's (height, width) dimensions. Can be overidden by `do_resize` in `preprocess`.
+            Whether to resize the image's (height, width) dimensions. Can be overridden by `do_resize` in `preprocess`.
         size (`Dict[str, int]` *optional*, defaults to `{"height": 384, "width": 512}`):
             Size of the image after resizing. Size of the image after resizing. If `keep_aspect_ratio` is `True`,
             the image is resized by choosing the smaller of the height and width scaling factors and using it for both dimensions.
             If `ensure_multiple_of` is also set, the image is further resized to a size that is a multiple of this value.
-            Can be overidden by `size` in `preprocess`.
+            Can be overridden by `size` in `preprocess`.
         resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
-            Defines the resampling filter to use if resizing the image. Can be overidden by `resample` in `preprocess`.
+            Defines the resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
         keep_aspect_ratio (`bool`, *optional*, defaults to `True`):
             If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it
             for both dimensions. This ensures that the image is scaled down as little as possible while still fitting
             within the desired output size. In case `ensure_multiple_of` is also set, the image is further resized to a
             size that is a multiple of this value by flooring the height and width to the nearest multiple of this value.
-            Can be overidden by `keep_aspect_ratio` in `preprocess`.
+            Can be overridden by `keep_aspect_ratio` in `preprocess`.
         ensure_multiple_of (`int`, *optional*, defaults to 32):
             If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Works by flooring
             the height and width to the nearest multiple of this value.
 
-            Works both with and without `keep_aspect_ratio` being set to `True`. Can be overidden by `ensure_multiple_of`
+            Works both with and without `keep_aspect_ratio` being set to `True`. Can be overridden by `ensure_multiple_of`
             in `preprocess`.
     """
 
@@ -348,7 +348,7 @@ class ZoeDepthImageProcessor(BaseImageProcessor):
                 If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Works by
                 flooring the height and width to the nearest multiple of this value.
 
-                Works both with and without `keep_aspect_ratio` being set to `True`. Can be overidden by
+                Works both with and without `keep_aspect_ratio` being set to `True`. Can be overridden by
                 `ensure_multiple_of` in `preprocess`.
             resample (`int`, *optional*, defaults to `self.resample`):
                 Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index 66a9c49ea5..bfca21e38d 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -426,7 +426,7 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
             if isinstance(stride_length_s, (int, float)):
                 stride_length_s = [stride_length_s, stride_length_s]
 
-            # XXX: Carefuly, this variable will not exist in `seq2seq` setting.
+            # XXX: Careful, this variable will not exist in `seq2seq` setting.
             # Currently chunking is not possible at this level for `seq2seq` so
             # it's ok.
             align_to = getattr(self.model.config, "inputs_to_logits_ratio", 1)
diff --git a/src/transformers/pipelines/mask_generation.py b/src/transformers/pipelines/mask_generation.py
index f87e45b7f8..25bd611215 100644
--- a/src/transformers/pipelines/mask_generation.py
+++ b/src/transformers/pipelines/mask_generation.py
@@ -33,7 +33,7 @@ logger = logging.get_logger(__name__)
 class MaskGenerationPipeline(ChunkPipeline):
     """
     Automatic mask generation for images using `SamForMaskGeneration`. This pipeline predicts binary masks for an
-    image, given an image. It is a `ChunkPipeline` because you can seperate the points in a mini-batch in order to
+    image, given an image. It is a `ChunkPipeline` because you can separate the points in a mini-batch in order to
     avoid OOM issues. Use the `points_per_batch` argument to control the number of points that will be processed at the
     same time. Default is `64`.
 
diff --git a/src/transformers/quantizers/base.py b/src/transformers/quantizers/base.py
index d5ae46a0af..2818b9d26e 100644
--- a/src/transformers/quantizers/base.py
+++ b/src/transformers/quantizers/base.py
@@ -239,7 +239,7 @@ class HfQuantizer(ABC):
 
     def dequantize(self, model):
         """
-        Potentially dequantize the model to retrive the original model, with some loss in accuracy / performance.
+        Potentially dequantize the model to retrieve the original model, with some loss in accuracy / performance.
         Note not all quantization schemes support this.
         """
         model = self._dequantize(model)
diff --git a/src/transformers/quantizers/quantizer_torchao.py b/src/transformers/quantizers/quantizer_torchao.py
index e5e02f5387..21dc2498d9 100644
--- a/src/transformers/quantizers/quantizer_torchao.py
+++ b/src/transformers/quantizers/quantizer_torchao.py
@@ -220,7 +220,7 @@ class TorchAoHfQuantizer(HfQuantizer):
         unexpected_keys: List[str],
     ):
         """
-        Each nn.Linear layer that needs to be quantized is processsed here.
+        Each nn.Linear layer that needs to be quantized is processed here.
         First, we set the value the weight tensor, then we move it to the target device. Finally, we quantize the module.
         """
         if self.quantization_config.quant_type == "autoquant":
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 731c02b8e0..fc82d0b255 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -359,7 +359,7 @@ class Trainer:
             Processing class used to process the data. If provided, will be used to automatically process the inputs
             for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
             reuse the fine-tuned model.
-            This supercedes the `tokenizer` argument, which is now deprecated.
+            This supersedes the `tokenizer` argument, which is now deprecated.
         model_init (`Callable[[], PreTrainedModel]`, *optional*):
             A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
             from a new instance of the model as given by this function.
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index 2eac33b7a2..7291b8f98a 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -1887,7 +1887,7 @@ class _LazyModule(ModuleType):
                 #
                 # dict_keys(['models.nllb_moe.configuration_nllb_moe', 'models.sew_d.configuration_sew_d'])
                 #
-                # with this, we don't only want to be able to import these explicitely, we want to be able to import
+                # with this, we don't only want to be able to import these explicitly, we want to be able to import
                 # every intermediate module as well. Therefore, this is what is returned:
                 #
                 # {
diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py
index 9f17b3881a..155fcd7681 100644
--- a/src/transformers/utils/quantization_config.py
+++ b/src/transformers/utils/quantization_config.py
@@ -283,7 +283,7 @@ class HqqConfig(QuantizationConfigMixin):
         nbits (`int`, *optional*, defaults to 4):
             Number of bits. Supported values are (8, 4, 3, 2, 1).
         group_size (`int`, *optional*, defaults to 64):
-            Group-size value. Supported values are any value that is divisble by weight.shape[axis]).
+            Group-size value. Supported values are any value that is divisible by weight.shape[axis].
         view_as_float (`bool`, *optional*, defaults to `False`):
             View the quantized weight as float (used in distributed training) if set to `True`.
         axis (`Optional[int]`, *optional*):
@@ -661,7 +661,7 @@ class GPTQConfig(QuantizationConfigMixin):
             Whether to quantize columns in order of decreasing activation size. Setting it to False can significantly
             speed up inference but the perplexity may become slightly worse. Also known as act-order.
         sym (`bool`, *optional*, defaults to `True`):
-            Whether to use symetric quantization.
+            Whether to use symmetric quantization.
         true_sequential (`bool`, *optional*, defaults to `True`):
             Whether to perform sequential quantization even within a single Transformer block. Instead of quantizing
             the entire block at once, we perform layer-wise quantization. As a result, each layer undergoes
@@ -1729,7 +1729,7 @@ class TorchAoConfig(QuantizationConfigMixin):
             # Handle AOBaseConfig serialization
             from torchao.core.config import config_to_dict
 
-            # For now we assume there is 1 config per Transfomer, however in the future
+            # For now we assume there is 1 config per Transformer, however in the future
             # We may want to support a config per fqn.
             d["quant_type"] = {"default": config_to_dict(self.quant_type)}
 
diff --git a/tests/models/bert/test_modeling_tf_bert.py b/tests/models/bert/test_modeling_tf_bert.py
index fe3689c9f0..b9fbdc9d43 100644
--- a/tests/models/bert/test_modeling_tf_bert.py
+++ b/tests/models/bert/test_modeling_tf_bert.py
@@ -650,7 +650,7 @@ class TFBertModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, PipelineTester
     def test_causal_lm_base_model(self):
         """Test the base model of the causal LM model
 
-        is_deocder=True, no cross_attention, no encoder outputs
+        is_decoder=True, no cross_attention, no encoder outputs
         """
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_causal_lm_base_model(*config_and_inputs)
@@ -658,7 +658,7 @@ class TFBertModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, PipelineTester
     def test_model_as_decoder(self):
         """Test the base model as a decoder (of an encoder-decoder architecture)
 
-        is_deocder=True + cross_attention + pass encoder outputs
+        is_decoder=True + cross_attention + pass encoder outputs
         """
         config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
         self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
diff --git a/tests/models/electra/test_modeling_tf_electra.py b/tests/models/electra/test_modeling_tf_electra.py
index 2b7e8d1a18..de9e61ea54 100644
--- a/tests/models/electra/test_modeling_tf_electra.py
+++ b/tests/models/electra/test_modeling_tf_electra.py
@@ -533,7 +533,7 @@ class TFElectraModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestC
     def test_causal_lm_base_model(self):
         """Test the base model of the causal LM model
 
-        is_deocder=True, no cross_attention, no encoder outputs
+        is_decoder=True, no cross_attention, no encoder outputs
         """
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_causal_lm_base_model(*config_and_inputs)
@@ -541,7 +541,7 @@ class TFElectraModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestC
     def test_model_as_decoder(self):
         """Test the base model as a decoder (of an encoder-decoder architecture)
 
-        is_deocder=True + cross_attention + pass encoder outputs
+        is_decoder=True + cross_attention + pass encoder outputs
         """
         config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
         self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
diff --git a/tests/models/esm/test_modeling_tf_esm.py b/tests/models/esm/test_modeling_tf_esm.py
index 52d163d2dc..c7478ab3c0 100644
--- a/tests/models/esm/test_modeling_tf_esm.py
+++ b/tests/models/esm/test_modeling_tf_esm.py
@@ -236,7 +236,7 @@ class TFEsmModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
     def test_model_as_decoder(self):
         """Test the base model as a decoder (of an encoder-decoder architecture)
 
-        is_deocder=True + cross_attention + pass encoder outputs
+        is_decoder=True + cross_attention + pass encoder outputs
         """
         config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
         self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
diff --git a/tests/models/rembert/test_modeling_tf_rembert.py b/tests/models/rembert/test_modeling_tf_rembert.py
index 551b4fcc61..b07d84d607 100644
--- a/tests/models/rembert/test_modeling_tf_rembert.py
+++ b/tests/models/rembert/test_modeling_tf_rembert.py
@@ -618,7 +618,7 @@ class TFRemBertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestC
     def test_causal_lm_base_model(self):
         """Test the base model of the causal LM model
 
-        is_deocder=True, no cross_attention, no encoder outputs
+        is_decoder=True, no cross_attention, no encoder outputs
         """
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_causal_lm_base_model(*config_and_inputs)
@@ -626,7 +626,7 @@ class TFRemBertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestC
     def test_model_as_decoder(self):
         """Test the base model as a decoder (of an encoder-decoder architecture)
 
-        is_deocder=True + cross_attention + pass encoder outputs
+        is_decoder=True + cross_attention + pass encoder outputs
         """
         config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
         self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
diff --git a/tests/models/roberta/test_modeling_tf_roberta.py b/tests/models/roberta/test_modeling_tf_roberta.py
index 0ee5cd03a7..d2dbc30928 100644
--- a/tests/models/roberta/test_modeling_tf_roberta.py
+++ b/tests/models/roberta/test_modeling_tf_roberta.py
@@ -592,7 +592,7 @@ class TFRobertaModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestC
     def test_causal_lm_base_model(self):
         """Test the base model of the causal LM model
 
-        is_deocder=True, no cross_attention, no encoder outputs
+        is_decoder=True, no cross_attention, no encoder outputs
         """
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_causal_lm_base_model(*config_and_inputs)
@@ -600,7 +600,7 @@ class TFRobertaModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestC
     def test_model_as_decoder(self):
         """Test the base model as a decoder (of an encoder-decoder architecture)
 
-        is_deocder=True + cross_attention + pass encoder outputs
+        is_decoder=True + cross_attention + pass encoder outputs
         """
         config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
         self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
diff --git a/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py
index 48dc45fca4..835a6d3e3a 100644
--- a/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py
+++ b/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py
@@ -594,7 +594,7 @@ class TFRobertaPreLayerNormModelTest(TFModelTesterMixin, PipelineTesterMixin, un
     def test_causal_lm_base_model(self):
         """Test the base model of the causal LM model
 
-        is_deocder=True, no cross_attention, no encoder outputs
+        is_decoder=True, no cross_attention, no encoder outputs
         """
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_causal_lm_base_model(*config_and_inputs)
@@ -602,7 +602,7 @@ class TFRobertaPreLayerNormModelTest(TFModelTesterMixin, PipelineTesterMixin, un
     def test_model_as_decoder(self):
         """Test the base model as a decoder (of an encoder-decoder architecture)
 
-        is_deocder=True + cross_attention + pass encoder outputs
+        is_decoder=True + cross_attention + pass encoder outputs
         """
         config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
         self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py
index 4c6f9031ae..fb7c66930c 100644
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -1103,7 +1103,7 @@ class ProcessorTesterMixin:
             signature.parameters.get("videos") is not None
             and signature.parameters["videos"].annotation == inspect._empty
         ):
-            self.skipTest(f"{self.processor_class} does not suport video inputs")
+            self.skipTest(f"{self.processor_class} does not support video inputs")
 
         if "feature_extractor" not in self.processor_class.attributes:
             self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")