From 214062201e85276720d86a858e6f3b745e64c6ec Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 25 Apr 2025 20:47:25 +0800 Subject: [PATCH] Fix typos in strings and comments (#37784) * Fix typos in strings and comments * Fix --- src/transformers/generation/streamers.py | 4 +- src/transformers/generation/tf_utils.py | 2 +- src/transformers/generation/utils.py | 4 +- src/transformers/generation/watermarking.py | 2 +- src/transformers/integrations/aqlm.py | 2 +- src/transformers/integrations/awq.py | 2 +- src/transformers/integrations/bitnet.py | 2 +- src/transformers/integrations/bitsandbytes.py | 6 +-- src/transformers/integrations/eetq.py | 2 +- src/transformers/integrations/fbgemm_fp8.py | 4 +- src/transformers/integrations/ggml.py | 2 +- src/transformers/integrations/higgs.py | 2 +- .../integrations/integration_utils.py | 2 +- src/transformers/integrations/peft.py | 2 +- src/transformers/integrations/quanto.py | 2 +- .../integrations/tensor_parallel.py | 8 ++-- src/transformers/integrations/vptq.py | 2 +- src/transformers/loss/loss_rt_detr.py | 2 +- src/transformers/modeling_utils.py | 2 +- .../models/bamba/modeling_bamba.py | 4 +- .../models/bamba/modular_bamba.py | 4 +- src/transformers/models/bark/modeling_bark.py | 6 +-- ...ping_original_tf2_checkpoint_to_pytorch.py | 4 +- .../models/bert/tokenization_bert.py | 2 +- .../tokenization_bert_japanese.py | 4 +- .../models/big_bird/modeling_big_bird.py | 2 +- .../models/big_bird/modeling_flax_big_bird.py | 4 +- .../modeling_bigbird_pegasus.py | 2 +- src/transformers/models/blip/modeling_blip.py | 6 +-- .../models/blip/modeling_tf_blip.py | 6 +-- ...rt_bloom_original_checkpoint_to_pytorch.py | 8 ++-- .../camembert/tokenization_camembert.py | 2 +- .../convert_chameleon_weights_to_hf.py | 2 +- .../models/chameleon/modeling_chameleon.py | 2 +- src/transformers/models/clap/modeling_clap.py | 10 ++--- .../models/clvp/feature_extraction_clvp.py | 4 +- .../modeling_conditional_detr.py | 12 ++--- 
.../models/convbert/tokenization_convbert.py | 2 +- .../models/cpmant/modeling_cpmant.py | 2 +- .../models/dab_detr/modeling_dab_detr.py | 6 +-- .../data2vec/configuration_data2vec_audio.py | 4 +- .../data2vec/modeling_tf_data2vec_vision.py | 2 +- .../configuration_decision_transformer.py | 2 +- .../modeling_decision_transformer.py | 2 +- .../modeling_deformable_detr.py | 6 +-- .../models/deprecated/deta/modeling_deta.py | 8 ++-- ..._original_pytorch_checkpoint_to_pytorch.py | 4 +- ...convert_gptsan_tf_checkpoint_to_pytorch.py | 2 +- .../modeling_gptsan_japanese.py | 10 ++--- .../tokenization_gptsan_japanese.py | 4 +- .../graphormer/modeling_graphormer.py | 2 +- .../deprecated/jukebox/convert_jukebox.py | 2 +- .../deprecated/jukebox/modeling_jukebox.py | 44 +++++++++---------- .../models/deprecated/mctct/modeling_mctct.py | 2 +- .../models/deprecated/mega/modeling_mega.py | 8 ++-- .../deprecated/realm/retrieval_realm.py | 2 +- .../deprecated/realm/tokenization_realm.py | 2 +- .../retribert/tokenization_retribert.py | 2 +- .../transfo_xl/configuration_transfo_xl.py | 2 +- .../tvlt/feature_extraction_tvlt.py | 4 +- .../deprecated/tvlt/image_processing_tvlt.py | 4 +- .../deprecated/van/convert_van_to_pytorch.py | 2 +- .../convert_depth_anything_to_hf.py | 2 +- .../models/depth_pro/modeling_depth_pro.py | 4 +- src/transformers/models/detr/modeling_detr.py | 12 ++--- .../distilbert/tokenization_distilbert.py | 2 +- .../models/dpt/convert_dinov2_depth_to_hf.py | 2 +- .../models/dpt/convert_dpt_beit_to_hf.py | 2 +- .../models/dpt/image_processing_dpt.py | 14 +++--- .../efficientnet/modeling_efficientnet.py | 2 +- .../models/electra/modeling_flax_electra.py | 2 +- .../models/electra/tokenization_electra.py | 2 +- src/transformers/models/emu3/modeling_emu3.py | 2 +- src/transformers/models/emu3/modular_emu3.py | 2 +- .../models/encodec/configuration_encodec.py | 2 +- .../models/encodec/modeling_encodec.py | 2 +- .../configuration_encoder_decoder.py | 2 +- 
.../modeling_flax_encoder_decoder.py | 2 +- .../modeling_tf_encoder_decoder.py | 2 +- .../models/flava/modeling_flava.py | 2 +- .../models/funnel/tokenization_funnel.py | 2 +- src/transformers/models/gpt2/modeling_gpt2.py | 2 +- .../tokenization_gpt_neox_japanese.py | 2 +- .../granitemoe/configuration_granitemoe.py | 4 +- .../models/granitemoe/modeling_granitemoe.py | 5 ++- .../configuration_granitemoeshared.py | 4 +- .../modeling_granitemoeshared.py | 5 ++- .../grounding_dino/modeling_grounding_dino.py | 10 ++--- .../models/hubert/configuration_hubert.py | 4 +- .../models/idefics/modeling_idefics.py | 4 +- .../models/idefics/processing_idefics.py | 2 +- .../models/jamba/modeling_jamba.py | 4 +- .../models/jetmoe/configuration_jetmoe.py | 2 +- .../models/jetmoe/modeling_jetmoe.py | 2 +- .../models/layoutlm/tokenization_layoutlm.py | 2 +- .../layoutlmv2/tokenization_layoutlmv2.py | 2 +- .../models/layoutlmv3/modeling_layoutlmv3.py | 2 +- src/transformers/models/led/modeling_led.py | 2 +- .../llama4/convert_llama4_weights_to_hf.py | 4 +- .../models/llama4/modeling_llama4.py | 4 +- .../llava/convert_llava_weights_to_hf.py | 4 +- .../models/llava/processing_llava.py | 2 +- .../llava_next/image_processing_llava_next.py | 2 +- .../llava_next/processing_llava_next.py | 2 +- .../processing_llava_next_video.py | 2 +- .../convert_llava_onevision_weights_to_hf.py | 2 +- .../image_processing_llava_onevision.py | 4 +- .../processing_llava_onevision.py | 2 +- .../models/longt5/configuration_longt5.py | 2 +- .../convert_longt5x_checkpoint_to_flax.py | 2 +- .../models/longt5/modeling_flax_longt5.py | 2 +- .../models/longt5/modeling_longt5.py | 2 +- .../models/lxmert/configuration_lxmert.py | 2 +- .../models/lxmert/modeling_lxmert.py | 4 +- .../models/lxmert/modeling_tf_lxmert.py | 4 +- .../models/lxmert/tokenization_lxmert.py | 2 +- .../models/mamba/configuration_mamba.py | 2 +- .../models/mamba2/modeling_mamba2.py | 4 +- .../marian/convert_marian_to_pytorch.py | 2 +- 
.../models/marian/modeling_marian.py | 2 +- .../mask2former/modeling_mask2former.py | 10 ++--- .../models/maskformer/modeling_maskformer.py | 10 ++--- .../maskformer/modeling_maskformer_swin.py | 2 +- .../models/mistral/modeling_tf_mistral.py | 2 +- .../models/mixtral/configuration_mixtral.py | 2 +- .../mllama/convert_mllama_weights_to_hf.py | 2 +- .../mobilebert/tokenization_mobilebert.py | 2 +- .../models/modernbert/modeling_modernbert.py | 2 +- .../models/modernbert/modular_modernbert.py | 2 +- .../models/mpnet/tokenization_mpnet.py | 2 +- src/transformers/models/mpt/modeling_mpt.py | 2 +- .../mra/convert_mra_pytorch_to_pytorch.py | 2 +- .../modeling_musicgen_melody.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 2 +- .../models/olmoe/configuration_olmoe.py | 2 +- .../oneformer/configuration_oneformer.py | 2 +- .../models/oneformer/modeling_oneformer.py | 12 ++--- .../models/owlv2/image_processing_owlv2.py | 2 +- .../models/owlv2/processing_owlv2.py | 2 +- .../models/owlvit/processing_owlvit.py | 2 +- .../models/patchtst/modeling_patchtst.py | 2 +- .../convert_phi4_multimodal_weights_to_hf.py | 2 +- .../feature_extraction_phi4_multimodal.py | 4 +- .../modeling_phi4_multimodal.py | 4 +- .../modular_phi4_multimodal.py | 4 +- .../pop2piano/feature_extraction_pop2piano.py | 10 ++--- .../pop2piano/tokenization_pop2piano.py | 2 +- .../image_processing_prompt_depth_anything.py | 14 +++--- .../prophetnet/tokenization_prophetnet.py | 2 +- .../pvt_v2/convert_pvt_v2_to_pytorch.py | 2 +- .../configuration_qwen2_5_omni.py | 6 +-- .../qwen2_5_omni/modeling_qwen2_5_omni.py | 8 ++-- .../qwen2_5_omni/modular_qwen2_5_omni.py | 14 +++--- .../qwen2_vl/image_processing_qwen2_vl.py | 4 +- .../image_processing_qwen2_vl_fast.py | 4 +- .../qwen3_moe/configuration_qwen3_moe.py | 2 +- .../models/rag/configuration_rag.py | 2 +- .../models/rag/modeling_tf_rag.py | 2 +- .../regnet/convert_regnet_to_pytorch.py | 2 +- .../models/regnet/modeling_flax_regnet.py | 2 +- 
.../resnet/convert_resnet_to_pytorch.py | 2 +- .../models/resnet/modeling_flax_resnet.py | 2 +- .../models/roc_bert/tokenization_roc_bert.py | 2 +- .../models/roformer/tokenization_roformer.py | 2 +- .../models/rt_detr/modeling_rt_detr.py | 2 +- .../models/rt_detr_v2/modeling_rt_detr_v2.py | 2 +- .../models/rwkv/configuration_rwkv.py | 2 +- .../models/sam/image_processing_sam.py | 12 ++--- src/transformers/models/sam/modeling_sam.py | 6 +-- .../models/sam/modeling_tf_sam.py | 2 +- src/transformers/models/sam/processing_sam.py | 2 +- .../seamless_m4t/modeling_seamless_m4t.py | 12 ++--- .../modeling_seamless_m4t_v2.py | 4 +- .../models/sew/configuration_sew.py | 4 +- .../models/sew_d/configuration_sew_d.py | 4 +- .../shieldgemma2/processing_shieldgemma2.py | 4 +- .../configuration_speech_encoder_decoder.py | 2 +- .../modeling_flax_speech_encoder_decoder.py | 2 +- .../modeling_speech_encoder_decoder.py | 2 +- .../speech_to_text/modeling_speech_to_text.py | 2 +- .../modeling_tf_speech_to_text.py | 2 +- .../models/speecht5/configuration_speecht5.py | 4 +- .../models/splinter/tokenization_splinter.py | 2 +- .../squeezebert/tokenization_squeezebert.py | 2 +- .../modeling_table_transformer.py | 6 +-- .../models/tapas/tokenization_tapas.py | 2 +- src/transformers/models/tvp/modeling_tvp.py | 2 +- .../unispeech/configuration_unispeech.py | 4 +- .../configuration_unispeech_sat.py | 4 +- .../video_llava/processing_video_llava.py | 2 +- .../modeling_flax_vision_encoder_decoder.py | 2 +- .../modeling_tf_vision_encoder_decoder.py | 2 +- .../modeling_vision_encoder_decoder.py | 2 +- .../modeling_flax_vision_text_dual_encoder.py | 4 +- .../modeling_tf_vision_text_dual_encoder.py | 4 +- .../modeling_vision_text_dual_encoder.py | 4 +- .../models/vit_mae/modeling_tf_vit_mae.py | 2 +- .../models/wav2vec2/configuration_wav2vec2.py | 4 +- .../models/wav2vec2/modeling_wav2vec2.py | 2 +- .../configuration_wav2vec2_bert.py | 4 +- .../convert_wav2vec2_seamless_checkpoint.py | 2 +- 
.../wav2vec2_bert/modeling_wav2vec2_bert.py | 2 +- .../wav2vec2_bert/modular_wav2vec2_bert.py | 2 +- .../configuration_wav2vec2_conformer.py | 4 +- .../modeling_wav2vec2_conformer.py | 2 +- .../tokenization_wav2vec2_phoneme.py | 2 +- .../models/whisper/configuration_whisper.py | 6 +-- .../whisper/feature_extraction_whisper.py | 4 +- .../models/whisper/generation_whisper.py | 10 ++--- .../models/whisper/tokenization_whisper.py | 6 +-- .../whisper/tokenization_whisper_fast.py | 4 +- .../models/x_clip/configuration_x_clip.py | 2 +- .../models/yolos/modeling_yolos.py | 6 +-- .../yoso/convert_yoso_pytorch_to_pytorch.py | 2 +- .../models/zamba/modeling_zamba.py | 2 +- .../models/zamba2/modeling_zamba2.py | 4 +- .../models/zamba2/modular_zamba2.py | 2 +- .../zoedepth/image_processing_zoedepth.py | 16 +++---- .../pipelines/automatic_speech_recognition.py | 2 +- src/transformers/pipelines/mask_generation.py | 2 +- src/transformers/quantizers/base.py | 2 +- .../quantizers/quantizer_torchao.py | 2 +- src/transformers/trainer.py | 2 +- src/transformers/utils/import_utils.py | 2 +- src/transformers/utils/quantization_config.py | 6 +-- tests/models/bert/test_modeling_tf_bert.py | 4 +- .../electra/test_modeling_tf_electra.py | 4 +- tests/models/esm/test_modeling_tf_esm.py | 2 +- .../rembert/test_modeling_tf_rembert.py | 4 +- .../roberta/test_modeling_tf_roberta.py | 4 +- .../test_modeling_tf_roberta_prelayernorm.py | 4 +- tests/test_processing_common.py | 2 +- 232 files changed, 431 insertions(+), 425 deletions(-) diff --git a/src/transformers/generation/streamers.py b/src/transformers/generation/streamers.py index c0fc26442b..863fd67bdd 100644 --- a/src/transformers/generation/streamers.py +++ b/src/transformers/generation/streamers.py @@ -162,7 +162,7 @@ class TextStreamer(BaseStreamer): class TextIteratorStreamer(TextStreamer): """ Streamer that stores print-ready text in a queue, to be used by a downstream application as an iterator. 
This is - useful for applications that benefit from acessing the generated text in a non-blocking way (e.g. in an interactive + useful for applications that benefit from accessing the generated text in a non-blocking way (e.g. in an interactive Gradio demo). @@ -233,7 +233,7 @@ class TextIteratorStreamer(TextStreamer): class AsyncTextIteratorStreamer(TextStreamer): """ Streamer that stores print-ready text in a queue, to be used by a downstream application as an async iterator. - This is useful for applications that benefit from acessing the generated text asynchronously (e.g. in an + This is useful for applications that benefit from accessing the generated text asynchronously (e.g. in an interactive Gradio demo). diff --git a/src/transformers/generation/tf_utils.py b/src/transformers/generation/tf_utils.py index 344147e6e3..b2da6f18ba 100644 --- a/src/transformers/generation/tf_utils.py +++ b/src/transformers/generation/tf_utils.py @@ -2082,7 +2082,7 @@ class TFGenerationMixin: def gather_fn(tensor): if batch_axis > 0: - # pushes all dimentions before the batch to the end, so we get (batch, beam_id, ...) + # pushes all dimensions before the batch to the end, so we get (batch, beam_id, ...) perm = tf.concat((tf.range(tf.rank(tensor))[batch_axis:], tf.range(batch_axis)), axis=0) tensor = tf.transpose(tensor, perm=perm) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index c0f6da8d76..995b556b7e 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1285,7 +1285,7 @@ class GenerationMixin: Merge user-defined processors/criteria with the ones instantiated inside `generate`. In case the same processor/criteria is present on both lists, use the user-defined one. - (Note: up to v4.49.0, this funtion threw an exception is the same logit processor was found twice.) + (Note: up to v4.49.0, this function threw an exception if the same logit processor was found twice.)
""" if len(custom_list) == 0: return default_list @@ -3852,7 +3852,7 @@ class GenerationMixin: model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) - # (joao) feature lost in the refactor. Probably won't implement, hurts readbility with minimal gains (there + # (joao) feature lost in the refactor. Probably won't implement, hurts readability with minimal gains (there # are newer low-memory alternatives like the offloaded cache) sequential = generation_config.low_memory if sequential: diff --git a/src/transformers/generation/watermarking.py b/src/transformers/generation/watermarking.py index e5f900c3b7..139d8cf2c7 100644 --- a/src/transformers/generation/watermarking.py +++ b/src/transformers/generation/watermarking.py @@ -538,7 +538,7 @@ class SynthIDTextWatermarkDetector: context_repetition_mask = self.logits_processor.compute_context_repetition_mask( input_ids=tokenized_outputs, ) - # context repitition mask shape [batch_size, output_len - (ngram_len - 1)] + # context repetition mask shape [batch_size, output_len - (ngram_len - 1)] combined_mask = context_repetition_mask * eos_token_mask diff --git a/src/transformers/integrations/aqlm.py b/src/transformers/integrations/aqlm.py index 0626da7ace..cdbed289bd 100644 --- a/src/transformers/integrations/aqlm.py +++ b/src/transformers/integrations/aqlm.py @@ -30,7 +30,7 @@ def replace_with_aqlm_linear( """ Public method that recursively replaces the Linear layers of the given model with AQLM quantized layers. `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the - conversion has been successfull or not. + conversion has been successful or not. 
Args: model (`torch.nn.Module`): diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index c860ea1f53..23a418ead5 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -101,7 +101,7 @@ def replace_with_awq_linear( """ Public method that recursively replaces the Linear layers of the given model with AWQ quantized layers. `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the - conversion has been successfull or not. + conversion has been successful or not. During the module replacement, we also infer the backend to use through the `quantization_config` object. diff --git a/src/transformers/integrations/bitnet.py b/src/transformers/integrations/bitnet.py index 0b50f9738a..aafca87856 100644 --- a/src/transformers/integrations/bitnet.py +++ b/src/transformers/integrations/bitnet.py @@ -201,7 +201,7 @@ def _replace_with_bitnet_linear( """ Private method that wraps the recursion for module replacement. - Returns the converted model and a boolean that indicates if the conversion has been successfull or not. + Returns the converted model and a boolean that indicates if the conversion has been successful or not. """ if current_key_name is None: diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py index 3973dc58c0..7a6f6e107a 100644 --- a/src/transformers/integrations/bitsandbytes.py +++ b/src/transformers/integrations/bitsandbytes.py @@ -158,7 +158,7 @@ def _replace_with_bnb_linear( """ Private method that wraps the recursion for module replacement. - Returns the converted model and a boolean that indicates if the conversion has been successfull or not. + Returns the converted model and a boolean that indicates if the conversion has been successful or not. 
""" for name, module in model.named_children(): if current_key_name is None: @@ -280,7 +280,7 @@ def replace_8bit_linear(*args, **kwargs): return replace_with_bnb_linear(*args, **kwargs) -# For backward compatiblity +# For backward compatibility def set_module_8bit_tensor_to_device(*args, **kwargs): warnings.warn( "`set_module_8bit_tensor_to_device` will be deprecated in a future version, please use `set_module_quantized_tensor_to_device` instead", @@ -403,7 +403,7 @@ def _dequantize_and_replace( some performance drop compared to the original model before quantization - use it only for specific usecases such as QLoRA adapters merging. - Returns the converted model and a boolean that indicates if the conversion has been successfull or not. + Returns the converted model and a boolean that indicates if the conversion has been successful or not. """ quant_method = quantization_config.quantization_method() diff --git a/src/transformers/integrations/eetq.py b/src/transformers/integrations/eetq.py index 97698cf1aa..a3d124aa4b 100644 --- a/src/transformers/integrations/eetq.py +++ b/src/transformers/integrations/eetq.py @@ -36,7 +36,7 @@ def _replace_with_eetq_linear( """ Private method that wraps the recursion for module replacement. - Returns the converted model and a boolean that indicates if the conversion has been successfull or not. + Returns the converted model and a boolean that indicates if the conversion has been successful or not. """ if current_key_name is None: current_key_name = [] diff --git a/src/transformers/integrations/fbgemm_fp8.py b/src/transformers/integrations/fbgemm_fp8.py index 1cc5a8b23a..ba4faa96c2 100644 --- a/src/transformers/integrations/fbgemm_fp8.py +++ b/src/transformers/integrations/fbgemm_fp8.py @@ -167,7 +167,7 @@ def _replace_with_fbgemm_fp8_linear( """ Private method that wraps the recursion for module replacement. - Returns the converted model and a boolean that indicates if the conversion has been successfull or not. 
+ Returns the converted model and a boolean that indicates if the conversion has been successful or not. """ import re @@ -196,7 +196,7 @@ def _replace_with_fbgemm_fp8_linear( # Force requires grad to False to avoid unexpected errors model._modules[name].requires_grad_(False) - # set non persistant buffer outside of init_empty_weights + # set non persistent buffer outside of init_empty_weights model._modules[name].input_scale_ub = torch.tensor( [quantization_config.activation_scale_ub], dtype=torch.float, diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index f1a25f2744..51bdc88608 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -424,7 +424,7 @@ class GGUFLlamaConverter(LlamaConverter): if post_processor: tokenizer.post_processor = post_processor - # HACK: patch the llama-3 tokenizer to use the correspinding pre-tokenizer + # HACK: patch the llama-3 tokenizer to use the corresponding pre-tokenizer # and normalizer if self.is_llama_3_tokenizer: tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel( diff --git a/src/transformers/integrations/higgs.py b/src/transformers/integrations/higgs.py index dd31764dfe..02c9a23dc6 100644 --- a/src/transformers/integrations/higgs.py +++ b/src/transformers/integrations/higgs.py @@ -558,7 +558,7 @@ def replace_with_higgs_linear( """ Public method that recursively replaces the Linear layers of the given model with HIGGS quantized layers. `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the - conversion has been successfull or not. + conversion has been successful or not. 
Args: model (`torch.nn.Module`): diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index a696612c3b..f18eb43c78 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -1093,7 +1093,7 @@ class CometCallback(TrainerCallback): if state.is_hyper_param_search: if mode is not None: logger.warning( - "Hyperparameter Search is enabled, forcing the creation of new experimetns, COMET_MODE value %r is ignored", + "Hyperparameter Search is enabled, forcing the creation of new experiments, COMET_MODE value %r is ignored", comet_old_mode, ) mode = "create" diff --git a/src/transformers/integrations/peft.py b/src/transformers/integrations/peft.py index 6aa3b137b1..0c9402abe6 100644 --- a/src/transformers/integrations/peft.py +++ b/src/transformers/integrations/peft.py @@ -350,7 +350,7 @@ class PeftAdapterMixin: for _, module in self.named_modules(): if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): - # For backward compatbility with previous PEFT versions + # For backward compatibility with previous PEFT versions if hasattr(module, "set_adapter"): module.set_adapter(adapter_name) else: diff --git a/src/transformers/integrations/quanto.py b/src/transformers/integrations/quanto.py index 1c57023219..c63b9b3b0c 100644 --- a/src/transformers/integrations/quanto.py +++ b/src/transformers/integrations/quanto.py @@ -30,7 +30,7 @@ def replace_with_quanto_layers( ): """ Public method that recursively replaces the Linear layers of the given model with Quanto quantized layers. - Returns the converted model and a boolean that indicates if the conversion has been successfull or not. + Returns the converted model and a boolean that indicates if the conversion has been successful or not. 
Args: model (`torch.nn.Module`): diff --git a/src/transformers/integrations/tensor_parallel.py b/src/transformers/integrations/tensor_parallel.py index d1fef6d492..61bf29224d 100644 --- a/src/transformers/integrations/tensor_parallel.py +++ b/src/transformers/integrations/tensor_parallel.py @@ -611,14 +611,14 @@ def add_tensor_parallel_hooks_to_module(model, module, tp_plan, layer_name, curr f"Trying to prepare {layer_name}, but it's not supported. Corresponding module: {module} Fix it's TP plan: {e}" ) - # 2. We add hooks to the parrent module if needed + # 2. We add hooks to the parent module if needed if "." in layer_name: - parrent_layer_name = layer_name.rsplit(".", 1)[0] - generic_name = re.sub(r"\d+", "*", parrent_layer_name) + parent_layer_name = layer_name.rsplit(".", 1)[0] + generic_name = re.sub(r"\d+", "*", parent_layer_name) # The module itself needs hooks if module_plan := tp_plan.get(generic_name, False): tp_layer = translate_to_torch_parallel_style(module_plan) - module_to_tp_ = model.get_submodule(parrent_layer_name) + module_to_tp_ = model.get_submodule(parent_layer_name) tp_layer.prepare_module_tp(module_to_tp_, device_mesh) diff --git a/src/transformers/integrations/vptq.py b/src/transformers/integrations/vptq.py index aa435517e8..f76bd70377 100644 --- a/src/transformers/integrations/vptq.py +++ b/src/transformers/integrations/vptq.py @@ -28,7 +28,7 @@ def replace_with_vptq_linear( """ Public method that recursively replaces the Linear layers of the given model with VPTQ quantized layers. `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the - conversion has been successfull or not. + conversion has been successful or not. 
Args: model (`torch.nn.Module`): diff --git a/src/transformers/loss/loss_rt_detr.py b/src/transformers/loss/loss_rt_detr.py index 88a4ac7cf4..a65d06de10 100644 --- a/src/transformers/loss/loss_rt_detr.py +++ b/src/transformers/loss/loss_rt_detr.py @@ -112,7 +112,7 @@ class RTDetrHungarianMatcher(nn.Module): # Compute the L1 cost between boxes bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) - # Compute the giou cost betwen boxes + # Compute the giou cost between boxes giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) # Compute the final cost matrix cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index e26801e106..5fbf51c297 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1395,7 +1395,7 @@ def _find_mismatched_keys( for key in new_state_dict.keys(): if key in model_state_dict and new_state_dict[key].shape != model_state_dict[key].shape: # This skips size mismatches for 4-bit weights. Two 4-bit values share an 8-bit container, causing size differences. - # Without matching with module type or paramter type it seems like a practical way to detect valid 4bit weights. + # Without matching with module type or parameter type it seems like a practical way to detect valid 4bit weights. 
if not ( new_state_dict[key].shape[-1] == 1 and new_state_dict[key].numel() * 2 == model_state_dict[key].numel() diff --git a/src/transformers/models/bamba/modeling_bamba.py b/src/transformers/models/bamba/modeling_bamba.py index 074eea3aa9..a6220fbf92 100644 --- a/src/transformers/models/bamba/modeling_bamba.py +++ b/src/transformers/models/bamba/modeling_bamba.py @@ -463,7 +463,7 @@ class BambaMixer(nn.Module): projection_size, bias=self.use_bias, ) - # selective projection used to make dt, B and C input dependant + # selective projection used to make dt, B and C input dependent # time step projection (discretization) # instantiate once and copy inv_dt in init_weights of PretrainedModel @@ -1541,7 +1541,7 @@ class BambaForCausalLM(BambaPreTrainedModel, GenerationMixin): use_cache=True, **kwargs, ): - # Overwitten -- has a unique cache type, `HybridMambaAttentionDynamicCache` + # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache` empty_past_kv = past_key_values is None diff --git a/src/transformers/models/bamba/modular_bamba.py b/src/transformers/models/bamba/modular_bamba.py index dd0d0e62c6..3ee6e72711 100644 --- a/src/transformers/models/bamba/modular_bamba.py +++ b/src/transformers/models/bamba/modular_bamba.py @@ -260,7 +260,7 @@ class BambaMixer(nn.Module): projection_size, bias=self.use_bias, ) - # selective projection used to make dt, B and C input dependant + # selective projection used to make dt, B and C input dependent # time step projection (discretization) # instantiate once and copy inv_dt in init_weights of PretrainedModel @@ -1257,7 +1257,7 @@ class BambaForCausalLM(LlamaForCausalLM): use_cache=True, **kwargs, ): - # Overwitten -- has a unique cache type, `HybridMambaAttentionDynamicCache` + # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache` empty_past_kv = past_key_values is None diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index 
57a0c4e5a7..2357fd5375 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -1296,7 +1296,7 @@ class BarkFineModel(BarkPreTrainedModel): @add_start_docstrings_to_model_forward(BARK_FINE_INPUTS_DOCSTRING) def forward( self, - codebook_idx: int, # an additionnal idx corresponding to the id of the codebook that will be predicted + codebook_idx: int, # an additional idx corresponding to the id of the codebook that will be predicted input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, @@ -1547,7 +1547,7 @@ class BarkFineModel(BarkPreTrainedModel): - [`BarkSemanticModel`] (also referred to as the 'text' model): a causal auto-regressive transformer model that takes as input tokenized text, and predicts semantic text tokens that capture the meaning of the text. - - [`BarkCoarseModel`] (also refered to as the 'coarse acoustics' model), also a causal autoregressive transformer, + - [`BarkCoarseModel`] (also referred to as the 'coarse acoustics' model), also a causal autoregressive transformer, that takes into input the results of the last model. It aims at regressing the first two audio codebooks necessary to `encodec`. 
- [`BarkFineModel`] (the 'fine acoustics' model), this time a non-causal autoencoder transformer, which iteratively @@ -1640,7 +1640,7 @@ class BarkModel(BarkPreTrainedModel): self.to("cpu") torch_accelerator_module.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - # this layer is used outside the first foward pass of semantic so need to be loaded before semantic + # this layer is used outside the first forward pass of semantic so need to be loaded before semantic self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device) hook = None diff --git a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py index cba1e1a2c3..f39ed47a19 100644 --- a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py +++ b/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py @@ -67,10 +67,10 @@ def convert_checkpoint_to_pytorch(tf_checkpoint_path: str, config_path: str, pyt return torch.from_numpy(array) - def get_encoder_attention_layer_array(layer_index: int, name: str, orginal_shape): + def get_encoder_attention_layer_array(layer_index: int, name: str, original_shape): full_name = f"encoder/_transformer_layers/{layer_index}/_attention_layer/{name}/.ATTRIBUTES/VARIABLE_VALUE" array = tf.train.load_variable(tf_checkpoint_path, full_name) - array = array.reshape(orginal_shape) + array = array.reshape(original_shape) if "kernel" in name: array = array.transpose() diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py index 42d4dd9455..141f10464a 100644 --- a/src/transformers/models/bert/tokenization_bert.py +++ b/src/transformers/models/bert/tokenization_bert.py @@ -460,7 +460,7 @@ class WordpieceTokenizer: Tokenizes a piece of text 
into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. - For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. Args: text: A single token or whitespace separated tokens. This should have diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index 8a841a3091..3a2b6f46f8 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -667,7 +667,7 @@ class CharacterTokenizer: """ Tokenizes a piece of text into characters. - For example, `input = "apple""` wil return as output `["a", "p", "p", "l", "e"]`. + For example, `input = "apple"` will return as output `["a", "p", "p", "l", "e"]`. Args: text: A single token or whitespace separated tokens. @@ -866,7 +866,7 @@ class WordpieceTokenizer: Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. - For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. Args: text: A single token or whitespace separated tokens.
This should have diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index abc5a1df44..3b250a056e 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -1171,7 +1171,7 @@ class BigBirdBlockSparseAttention(nn.Module): if plan_idx > 0: # set the row for all from_blocks starting from 0 to # plan_block_length[plan_idx-1] - # column indx start fromm plan_block_length[plan_idx-1] and ends at + # column indx start from plan_block_length[plan_idx-1] and ends at # plan_block_length[plan_idx] if plan_num_rand_blocks[plan_idx] > 0: rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx])) diff --git a/src/transformers/models/big_bird/modeling_flax_big_bird.py b/src/transformers/models/big_bird/modeling_flax_big_bird.py index 7f43a4c5ab..e3bdfc38da 100644 --- a/src/transformers/models/big_bird/modeling_flax_big_bird.py +++ b/src/transformers/models/big_bird/modeling_flax_big_bird.py @@ -1055,7 +1055,7 @@ class FlaxBigBirdBlockSparseAttention(nn.Module): from_block_size: int. size of block in from sequence. to_block_size: int. size of block in to sequence. num_heads: int. total number of heads. - plan_from_length: list. plan from length where num_random_blocks are choosen from. + plan_from_length: list. plan from length where num_random_blocks are chosen from. plan_num_rand_blocks: list. number of rand blocks within the plan. indices_prng_key: jax.random.PRNGKey. PRNG key that is used to perform random jax operations. deterministic: bool. When False random attention will be used. 
@@ -1104,7 +1104,7 @@ class FlaxBigBirdBlockSparseAttention(nn.Module): if plan_idx > 0: # set the row for all from_blocks starting from 0 to # plan_block_length[plan_idx-1] - # column indx start fromm plan_block_length[plan_idx-1] and ends at + # column indx start from plan_block_length[plan_idx-1] and ends at # plan_block_length[plan_idx] if plan_num_rand_blocks[plan_idx] > 0: rnd_r_cnt = int(sum(plan_num_rand_blocks[:plan_idx])) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 6827a4a188..ee81c6b3af 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -970,7 +970,7 @@ class BigBirdPegasusBlockSparseAttention(nn.Module): if plan_idx > 0: # set the row for all from_blocks starting from 0 to # plan_block_length[plan_idx-1] - # column indx start fromm plan_block_length[plan_idx-1] and ends at + # column indx start from plan_block_length[plan_idx-1] and ends at # plan_block_length[plan_idx] if plan_num_rand_blocks[plan_idx] > 0: rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx])) diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index 1f248ab8be..3b4d44afcd 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -64,7 +64,7 @@ class BlipForConditionalGenerationModelOutput(ModelOutput): Args: loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - Languge modeling loss from the text decoder. + Language modeling loss from the text decoder. logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*): Prediction scores of the language modeling head of the text decoder model. 
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*): @@ -109,7 +109,7 @@ class BlipTextVisionModelOutput(ModelOutput): Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Languge modeling loss from the text decoder. + Language modeling loss from the text decoder. image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): The image embeddings obtained by applying the projection layer to the pooler_output. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -145,7 +145,7 @@ class BlipImageTextMatchingModelOutput(ModelOutput): itm_score (`torch.FloatTensor`): The image-text similarity scores. loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Languge modeling loss from the text decoder. + Language modeling loss from the text decoder. image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): The image embeddings obtained by applying the projection layer to the pooler_output. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/blip/modeling_tf_blip.py b/src/transformers/models/blip/modeling_tf_blip.py index 9573ca0fbb..36e8a2da7a 100644 --- a/src/transformers/models/blip/modeling_tf_blip.py +++ b/src/transformers/models/blip/modeling_tf_blip.py @@ -73,7 +73,7 @@ class TFBlipForConditionalGenerationModelOutput(ModelOutput): Args: loss (`tf.Tensor`, *optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`): - Languge modeling loss from the text decoder. + Language modeling loss from the text decoder. 
logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*): Prediction scores of the language modeling head of the text decoder model. image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`, *optional*): @@ -118,7 +118,7 @@ class TFBlipTextVisionModelOutput(ModelOutput): Args: loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Languge modeling loss from the text decoder. + Language modeling loss from the text decoder. image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): The image embeddings obtained by applying the projection layer to the pooler_output. last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -154,7 +154,7 @@ class TFBlipImageTextMatchingModelOutput(ModelOutput): itm_score (`tf.Tensor`): The image-text similarity scores. loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Languge modeling loss from the text decoder. + Language modeling loss from the text decoder. image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): The image embeddings obtained by applying the projection layer to the pooler_output. 
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py b/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py index 73d251875d..c4aa6f27c9 100644 --- a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py +++ b/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py @@ -116,12 +116,12 @@ def convert_bloom_checkpoint_to_pytorch( else: for key in tensors.keys(): if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - # We average (sum and then divide) some weights accross TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425) + # We average (sum and then divide) some weights across TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425) tensors[key] += temp[key] else: # Some weights are RowParallelLinear in Megatron-Deepspeed, others are ColumnParallel cat_dim = 1 if any(text in key for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 - # We concatenate these weights accross TP ranks + # We concatenate these weights across TP ranks tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim) # Divide by the number of TP the weights we want to average @@ -175,13 +175,13 @@ def convert_bloom_checkpoint_to_pytorch( tensors = temp else: for key in tensors.keys(): - # We average (sum and then divide) some weights accross TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425) + # We average (sum and then divide) some weights across TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425) if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): tensors[key] += 
temp[key] else: # Some weights are RowParallelLinear in Megatron-Deepspeed, others are ColumnParallel cat_dim = 1 if any(text in key for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 - # We concatenate these weights accross TP ranks + # We concatenate these weights across TP ranks tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim) # Divide by the number of TP the weights we want to average diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py index 23cc569d49..76f95b5f8c 100644 --- a/src/transformers/models/camembert/tokenization_camembert.py +++ b/src/transformers/models/camembert/tokenization_camembert.py @@ -181,7 +181,7 @@ class CamembertTokenizer(PreTrainedTokenizer): def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" - # specifi to camembert, both 3 and 4 point to the unk token. + # specific to camembert, both 3 and 4 point to the unk token. 
if self.sp_model.PieceToId(token) == 0: # Convert sentence piece unk token to fairseq unk token index return self.unk_token_id diff --git a/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py b/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py index f74607f7b3..59b253b5ec 100644 --- a/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py +++ b/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py @@ -384,7 +384,7 @@ def write_model(model_path, input_base_path, model_size, chameleon_version=1): tokenizer_file=os.path.join(input_base_path, "tokenizer/text_tokenizer_modified.json"), legacy=False ) tokenizer.sep_token_id = 8710 # assign to sep so that we can append it after input text - tokenizer.pad_token_id = 1 # assing to special pad_token + tokenizer.pad_token_id = 1 # assign to special pad_token image_processor = ChameleonImageProcessor() processor = ChameleonProcessor(image_processor=image_processor, tokenizer=tokenizer) processor.save_pretrained(model_path) diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 9a7d43bdb5..b03336ce7e 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -124,7 +124,7 @@ class ChameleonLinearScalingRotaryEmbedding(ChameleonRotaryEmbedding): """ChameleonRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" def forward(self, x, position_ids): - # difference to the original RoPE: a scaling factor is aplied to the position ids + # difference to the original RoPE: a scaling factor is applied to the position ids position_ids = position_ids.float() / self.scaling_factor cos, sin = super().forward(x, position_ids) return cos, sin diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index a7a51cc86a..4f89deed49 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -864,9 +864,9 @@ class ClapAudioEncoder(nn.Module): _, _, time_length, freq_length = normalized_input_features.shape spec_width = int(self.spec_size * self.freq_ratio) - spec_heigth = self.spec_size // self.freq_ratio + spec_height = self.spec_size // self.freq_ratio - if time_length > spec_width or freq_length > spec_heigth: + if time_length > spec_width or freq_length > spec_height: raise ValueError("the wav size should be less than or equal to the swin input size") # to avoid bicubic zero error @@ -874,14 +874,14 @@ class ClapAudioEncoder(nn.Module): normalized_input_features = nn.functional.interpolate( normalized_input_features, (spec_width, freq_length), mode="bicubic", align_corners=True ) - if freq_length < spec_heigth: + if freq_length < spec_height: normalized_input_features = nn.functional.interpolate( - normalized_input_features, (time_length, spec_heigth), mode="bicubic", align_corners=True + normalized_input_features, (time_length, spec_height), mode="bicubic", align_corners=True ) batch, channels, time, freq = normalized_input_features.shape - # batch_size, channels, spec_width, spec_heigth --> batch_size, channels, spec_heigth * freq_ratio, spec_width // freq_ratio + # batch_size, channels, spec_width, spec_height --> batch_size, channels, spec_height * freq_ratio, spec_width // freq_ratio normalized_input_features = 
normalized_input_features.reshape( batch, channels * self.freq_ratio, time // self.freq_ratio, freq ) diff --git a/src/transformers/models/clvp/feature_extraction_clvp.py b/src/transformers/models/clvp/feature_extraction_clvp.py index ce6f5c782e..6a4965971c 100644 --- a/src/transformers/models/clvp/feature_extraction_clvp.py +++ b/src/transformers/models/clvp/feature_extraction_clvp.py @@ -49,9 +49,9 @@ class ClvpFeatureExtractor(SequenceFeatureExtractor): The default length of raw audio in seconds. If `max_length` is not set during `__call__` then it will automatically be set to default_audio_length * `self.sampling_rate`. hop_length (`int`, *optional*, defaults to 256): - Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients. + Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients. chunk_length (`int`, *optional*, defaults to 30): - The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio + The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio sequences. n_fft (`int`, *optional*, defaults to 1024): Size of the Fourier transform. diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 90b7b68bb4..13fde28d02 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -147,7 +147,7 @@ class ConditionalDetrObjectDetectionOutput(ModelOutput): possible padding). You can use [`~ConditionalDetrImageProcessor.post_process_object_detection`] to retrieve the unnormalized bounding boxes. auxiliary_outputs (`list[Dict]`, *optional*): - Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + Optional, only returned when auxiliary losses are activated (i.e. 
`config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and `pred_boxes`) for each decoder layer. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -1550,8 +1550,8 @@ class ConditionalDetrModel(ConditionalDetrPreTrainedModel): flattened_mask = mask.flatten(1) # Fourth, sent flattened_features + flattened_mask + object_queries through encoder - # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) - # flattened_mask is a Tensor of shape (batch_size, heigth*width) + # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, height*width) if encoder_outputs is None: encoder_outputs = self.encoder( inputs_embeds=flattened_features, @@ -1908,8 +1908,8 @@ class ConditionalDetrForSegmentation(ConditionalDetrPreTrainedModel): flattened_mask = mask.flatten(1) # Fourth, sent flattened_features + flattened_mask + object_queries through encoder - # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) - # flattened_mask is a Tensor of shape (batch_size, heigth*width) + # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, height*width) if encoder_outputs is None: encoder_outputs = self.conditional_detr.model.encoder( inputs_embeds=flattened_features, @@ -2046,7 +2046,7 @@ class ConditionalDetrMaskHeadSmallConv(nn.Module): nn.init.constant_(m.bias, 0) def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): - # here we concatenate x, the projected feature map, of shape (batch_size, d_model, heigth/32, width/32) with + # here we concatenate x, the projected feature map, of shape (batch_size, d_model, height/32, width/32) with # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, 
width/32). # We expand the projected feature map to match the number of heads. x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) diff --git a/src/transformers/models/convbert/tokenization_convbert.py b/src/transformers/models/convbert/tokenization_convbert.py index c2d6842838..1593e77ef4 100644 --- a/src/transformers/models/convbert/tokenization_convbert.py +++ b/src/transformers/models/convbert/tokenization_convbert.py @@ -465,7 +465,7 @@ class WordpieceTokenizer: Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. - For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. Args: text: A single token or whitespace separated tokens. This should have diff --git a/src/transformers/models/cpmant/modeling_cpmant.py b/src/transformers/models/cpmant/modeling_cpmant.py index df0aebe3cb..a587a11f87 100755 --- a/src/transformers/models/cpmant/modeling_cpmant.py +++ b/src/transformers/models/cpmant/modeling_cpmant.py @@ -454,7 +454,7 @@ class CpmAntSegmentPositionEmbedding(nn.Module): ) if querylen != query_segment.size(1): raise AssertionError( - f"querylen should be equal to query_segment.size(1), but got {querylen} and {query_segment.szie(1)}!" + f"querylen should be equal to query_segment.size(1), but got {querylen} and {query_segment.size(1)}!" ) key_pos = key_pos.view(batch, -1, keylen) diff --git a/src/transformers/models/dab_detr/modeling_dab_detr.py b/src/transformers/models/dab_detr/modeling_dab_detr.py index 84d6f276a8..3a60a07edf 100644 --- a/src/transformers/models/dab_detr/modeling_dab_detr.py +++ b/src/transformers/models/dab_detr/modeling_dab_detr.py @@ -143,7 +143,7 @@ class DabDetrObjectDetectionOutput(ModelOutput): possible padding). 
You can use [`~DabDetrImageProcessor.post_process_object_detection`] to retrieve the unnormalized bounding boxes. auxiliary_outputs (`list[Dict]`, *optional*): - Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and `pred_boxes`) for each decoder layer. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -1415,8 +1415,8 @@ class DabDetrModel(DabDetrPreTrainedModel): reference_position_embeddings = self.query_refpoint_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1) # Fourth, sent flattened_features + flattened_mask + object_queries through encoder - # flattened_features is a Tensor of shape (heigth*width, batch_size, hidden_size) - # flattened_mask is a Tensor of shape (batch_size, heigth*width) + # flattened_features is a Tensor of shape (height*width, batch_size, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, height*width) if encoder_outputs is None: encoder_outputs = self.encoder( inputs_embeds=flattened_features, diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py index c91b99e56c..bab37cc6c1 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_audio.py +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -90,7 +90,7 @@ class Data2VecAudioConfig(PretrainedConfig): Number of groups of 1D convolutional positional embeddings layer. mask_time_prob (`float`, *optional*, defaults to 0.05): Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking - procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. 
If + procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector span to be masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the mask_time_length (`int`, *optional*, defaults to 10): @@ -101,7 +101,7 @@ class Data2VecAudioConfig(PretrainedConfig): mask_time_min_masks'' mask_feature_prob (`float`, *optional*, defaults to 0.0): Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The - masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over + masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is diff --git a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py index 813fad89dc..9a41ed6fb0 100644 --- a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py @@ -1666,7 +1666,7 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel): features[i + 2] = ops[i + 2](features[i + 2]) logits = self.decode_head(features) - # Tranpose the logits to maintain consistency in the output formats. + # Transpose the logits to maintain consistency in the output formats. 
transposed_logits = tf.transpose(logits, perm=[0, 3, 1, 2]) auxiliary_logits = None diff --git a/src/transformers/models/decision_transformer/configuration_decision_transformer.py b/src/transformers/models/decision_transformer/configuration_decision_transformer.py index e677206aa0..436834c7e5 100644 --- a/src/transformers/models/decision_transformer/configuration_decision_transformer.py +++ b/src/transformers/models/decision_transformer/configuration_decision_transformer.py @@ -26,7 +26,7 @@ class DecisionTransformerConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`DecisionTransformerModel`]. It is used to instantiate a Decision Transformer model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the standard - DecisionTransformer architecture. Many of the config options are used to instatiate the GPT2 model that is used as + DecisionTransformer architecture. Many of the config options are used to instantiate the GPT2 model that is used as part of the architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the diff --git a/src/transformers/models/decision_transformer/modeling_decision_transformer.py b/src/transformers/models/decision_transformer/modeling_decision_transformer.py index 54000b8f24..22501ee508 100755 --- a/src/transformers/models/decision_transformer/modeling_decision_transformer.py +++ b/src/transformers/models/decision_transformer/modeling_decision_transformer.py @@ -319,7 +319,7 @@ class DecisionTransformerGPT2Attention(nn.Module): else: # Attention functions are consistent with previous equivalent attention classes, however they do not support some options # (e.g. layer scaling, head mask) that eager supports. 
These implementations are thus equivalent to previous code, but - # not necessarily to eager (if mentionned options are provided). + # not necessarily to eager (if mentioned options are provided). attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] if using_eager and self.reorder_and_upcast_attn: diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 4e177dde1a..a540bdc5d7 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -226,7 +226,7 @@ class DeformableDetrObjectDetectionOutput(ModelOutput): possible padding). You can use [`~DeformableDetrProcessor.post_process_object_detection`] to retrieve the unnormalized bounding boxes. auxiliary_outputs (`list[Dict]`, *optional*): - Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and `pred_boxes`) for each decoder layer. 
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): @@ -1578,8 +1578,8 @@ class DeformableDetrModel(DeformableDetrPreTrainedModel): scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale - width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) - proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) + width_height = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4) proposals.append(proposal) _cur += height * width output_proposals = torch.cat(proposals, 1) diff --git a/src/transformers/models/deprecated/deta/modeling_deta.py b/src/transformers/models/deprecated/deta/modeling_deta.py index 3e0b359947..ef4f0da573 100644 --- a/src/transformers/models/deprecated/deta/modeling_deta.py +++ b/src/transformers/models/deprecated/deta/modeling_deta.py @@ -267,7 +267,7 @@ class DetaObjectDetectionOutput(ModelOutput): possible padding). You can use [`~DetaProcessor.post_process_object_detection`] to retrieve the unnormalized bounding boxes. auxiliary_outputs (`list[Dict]`, *optional*): - Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and `pred_boxes`) for each decoder layer. 
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): @@ -1570,8 +1570,8 @@ class DetaModel(DetaPreTrainedModel): scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale - width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) - proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) + width_height = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4) proposals.append(proposal) _cur += height * width level_ids.append(grid.new_ones(height * width, dtype=torch.long) * level) @@ -2293,7 +2293,7 @@ class DetaLoss(nn.Module): else: indices = self.matcher(outputs_without_aux, targets) - # Compute the average number of target boxes accross all nodes, for normalization purposes + # Compute the average number of target boxes across all nodes, for normalization purposes num_boxes = sum(len(t["class_labels"]) for t in targets) num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) # Check that we have initialized the distributed state diff --git a/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py index 8ac9a13f5c..80f16881b5 100644 --- a/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py @@ -194,9 +194,9 @@ def convert_efficientformer_checkpoint( # Save Checkpoints Path(pytorch_dump_path).mkdir(exist_ok=True) model.save_pretrained(pytorch_dump_path) - print(f"Checkpoint successfuly converted. 
Model saved at {pytorch_dump_path}") + print(f"Checkpoint successfully converted. Model saved at {pytorch_dump_path}") processor.save_pretrained(pytorch_dump_path) - print(f"Processor successfuly saved at {pytorch_dump_path}") + print(f"Processor successfully saved at {pytorch_dump_path}") if push_to_hub: print("Pushing model to the hub...") diff --git a/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py index a84d000d44..8aa927d821 100644 --- a/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py @@ -47,7 +47,7 @@ def convert_tf_gptsan_to_pt(args): player = int(key_name[9]) elif key_name.startswith("pasts/out"): player = 8 - name = "model.sqout.%d.weight" % (player * 2) # enter to nn.Sequencial with Tanh, so 2 at a time + name = "model.sqout.%d.weight" % (player * 2) # enter to nn.Sequential with Tanh, so 2 at a time state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix new_state[name] = torch.tensor(state) elif key_name.startswith("model/moe"): diff --git a/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py index a35ea4a311..17da733be9 100644 --- a/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py +++ b/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py @@ -258,12 +258,12 @@ class GPTSanJapaneseSparseMLP(nn.Module): expert the corresponding hidden states. 
""" - # Step 1: Get the router_mask from the router as wel as the probabilities + # Step 1: Get the router_mask from the router as well as the probabilities router_mask, router_probs, router_logits = self.router(hidden_states) expert_index = torch.argmax(router_mask, dim=-1) # The routers introduced might not always map all the tokens, to a router, which means that some hidden states - # can be unchanged from one layer to another. That is why the hidden states are cloned before updating only the seleced ones. + # can be unchanged from one layer to another. That is why the hidden states are cloned before updating only the selected ones. next_states = hidden_states.clone() for idx, expert in enumerate(self.experts.values()): @@ -905,7 +905,7 @@ class GPTSanJapaneseModel(GPTSanJapanesePreTrainedModel): Returns: `MoEModelOutputWithPastAndCrossAttentions` or `tuple` if `return_dict` returns - MoEModelOutputWithPastAndCrossAttentions insted of tuple + MoEModelOutputWithPastAndCrossAttentions instead of tuple """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict device = self.position_embeddings.weight.device @@ -1006,7 +1006,7 @@ class GPTSanJapaneseModel(GPTSanJapanesePreTrainedModel): if token_type_ids is not None: token_type_ids = token_type_ids.unsqueeze(1).unsqueeze(2) prefix_lm_mask = ((prefix_lm_mask + token_type_ids) > 0).float() - # Marge prefix_lm_mask and attention_mask + # Merge prefix_lm_mask and attention_mask extended_attention_mask = prefix_lm_mask * attention_mask.unsqueeze(1).unsqueeze(2) # Prepare head mask if needed @@ -1130,7 +1130,7 @@ class GPTSanJapaneseForConditionalGeneration(GPTSanJapanesePreTrainedModel): labels in `[0, ..., config.vocab_size]` Returns: - `MoECausalLMOutputWithPast` or `tuple` if `return_dict` returns MoECausalLMOutputWithPast insted of tuple + `MoECausalLMOutputWithPast` or `tuple` if `return_dict` returns MoECausalLMOutputWithPast instead of tuple Example: diff --git 
a/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py index c93ea87278..9ffe4d9b14 100644 --- a/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py +++ b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py @@ -125,7 +125,7 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer): emoji_file (`str`): File containing the emoji. unk_token (`str`, *optional*, defaults to `"<|nottoken|>"`): - The token used for unknown charactor + The token used for unknown character pad_token (`str`, *optional*, defaults to `"<|separator|>"`): The token used for padding bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`): @@ -372,7 +372,7 @@ class SubWordJapaneseTokenizer: - Decoding byte0~byte255 tokens correctly - Added bagofword token handling - https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT Lisence according to the + https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT License according to the original repository. 
MIT License diff --git a/src/transformers/models/deprecated/graphormer/modeling_graphormer.py b/src/transformers/models/deprecated/graphormer/modeling_graphormer.py index 1253d1365e..7b91a429b0 100755 --- a/src/transformers/models/deprecated/graphormer/modeling_graphormer.py +++ b/src/transformers/models/deprecated/graphormer/modeling_graphormer.py @@ -740,7 +740,7 @@ class GraphormerPreTrainedModel(PreTrainedModel): Initialize the weights """ if isinstance(module, (nn.Linear, nn.Conv2d)): - # We might be missing part of the Linear init, dependant on the layer num + # We might be missing part of the Linear init, dependent on the layer num module.weight.data.normal_(mean=0.0, std=0.02) if module.bias is not None: module.bias.data.zero_() diff --git a/src/transformers/models/deprecated/jukebox/convert_jukebox.py b/src/transformers/models/deprecated/jukebox/convert_jukebox.py index aac3b2efe7..3380e38693 100644 --- a/src/transformers/models/deprecated/jukebox/convert_jukebox.py +++ b/src/transformers/models/deprecated/jukebox/convert_jukebox.py @@ -197,7 +197,7 @@ def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping): if f"{key_prefix}.{key}" not in model_state_dict or key is None: print(f"failed converting {original_key} to {key}, does not match") - # handle missmatched shape + # handle mismatched shape elif value.shape != model_state_dict[f"{key_prefix}.{key}"].shape: val = model_state_dict[f"{key_prefix}.{key}"] print(f"{original_key}-> {key} : \nshape {val.shape} and {value.shape}, do not match") diff --git a/src/transformers/models/deprecated/jukebox/modeling_jukebox.py b/src/transformers/models/deprecated/jukebox/modeling_jukebox.py index 566148ceda..3bff1d83ba 100755 --- a/src/transformers/models/deprecated/jukebox/modeling_jukebox.py +++ b/src/transformers/models/deprecated/jukebox/modeling_jukebox.py @@ -1308,11 +1308,11 @@ class JukeboxConditionalAutoregressive(nn.Module): Number of tokens or lyrics tokens provided in a single pass. 
embed_dim (`int`, *optional*): Either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codebook dimension, - if the model combines lyrics and music tokens, or simply n_vocab if the model is a seperate encoder + if the model combines lyrics and music tokens, or simply n_vocab if the model is a separate encoder audio_conditioning (`bool`, *optional*, defaults to `False`): - Whether or not the prior supports conditionning on audio. + Whether or not the prior supports conditioning on audio. metadata_conditioning (`bool`, *optional*, defaults to `False`): - Whether or not the prior supports conditionning on artitst, genres, lyrics and timing. + Whether or not the prior supports conditioning on artist, genres, lyrics and timing. is_encoder (`bool`, *optional*, defaults to `False`): Whether the model is an encoder only model. """ @@ -1392,7 +1392,7 @@ class JukeboxConditionalAutoregressive(nn.Module): hidden_states = self.transformer( hidden_states, last_encoder_hidden_states=last_encoder_hidden_states ) # Transformer - if self.add_cond_after_transformer: # Piped doesnt add x_cond + if self.add_cond_after_transformer: # Piped doesn't add x_cond hidden_states = hidden_states + audio_conditioning activations = hidden_states @@ -1535,7 +1535,7 @@ class JukeboxConditionalAutoregressive(nn.Module): if get_preds: preds = [] - # Fill up key/value cache for past context by runing forward pass. + # Fill up key/value cache for past context by running forward pass. # We do so in chunks instead of doing the whole past in one forward pass to reduce max memory usage.
if chunk_size is None: chunk_size = len(sampled_audio) @@ -1617,7 +1617,7 @@ class JukeboxConditionalAutoregressive(nn.Module): class JukeboxMusicTokenConditioner(nn.Module): """ - The `JukeboxMusicTokenConditioner` takes music tokens as an input (coresponding to the codes of the VQVAE's + The `JukeboxMusicTokenConditioner` takes music tokens as an input (corresponding to the codes of the VQVAE's codebook) and upsamples it using a single layer of decoder convolution block (the same is used in the VQVAE). """ @@ -1637,20 +1637,20 @@ class JukeboxMusicTokenConditioner(nn.Module): ) self.layer_norm = JukeboxLayerNorm(config.hidden_size) - def forward(self, music_tokens, raw_audio_conditionning=None): + def forward(self, music_tokens, raw_audio_conditioning=None): """ Args: music_tokens (`torch.LongTensor`): - Music tokens form the uper level in range(nb_discrete_codes) - raw_audio_conditionning (`torch.LongTensor`, *optional*): + Music tokens from the upper level in range(nb_discrete_codes) + raw_audio_conditioning (`torch.LongTensor`, *optional*): Audio used when primed sampling, raw audio information that conditions the generation """ - if raw_audio_conditionning is None: - raw_audio_conditionning = 0.0 + if raw_audio_conditioning is None: + raw_audio_conditioning = 0.0 # Embed music_tokens music_tokens = music_tokens.long() hidden_states = self.embed_tokens(music_tokens) - hidden_states = hidden_states + raw_audio_conditionning + hidden_states = hidden_states + raw_audio_conditioning # Run conditioner hidden_states = hidden_states.permute(0, 2, 1) @@ -1768,7 +1768,7 @@ class JukeboxPrior(PreTrainedModel): """ The JukeboxPrior class, which is a wrapper around the various conditioning and the transformer. JukeboxPrior can be seen as language models trained on music. They model the next `music token` prediction task. If a (lyric) `encoderù - is defined, it also models the `next character` prediction on the lyrics.
Can be conditionned on timing, artist, + is defined, it also models the `next character` prediction on the lyrics. Can be conditioned on timing, artist, genre, lyrics and codes from lower-levels Priors. Args: @@ -1809,7 +1809,7 @@ class JukeboxPrior(PreTrainedModel): elif isinstance(module, JukeboxConditionalAutoregressive) and hasattr(module, "start_token"): module.start_token.data.normal_(mean=0.0, std=0.01 * init_scale) elif isinstance(module, JukeboxResConv1DBlock) and self.config.zero_out: - module.conv1d_2.weigth.data.zero_() + module.conv1d_2.weight.data.zero_() module.conv1d_2.bias.data.zero_() if isinstance(module, nn.LayerNorm): module.bias.data.zero_() @@ -1931,7 +1931,7 @@ class JukeboxPrior(PreTrainedModel): tokens_list = torch.zeros( (labels.shape[0], self.nb_relevant_lyric_tokens), dtype=torch.long, device=labels.device ) - indices_list = [] # whats the index of each current character in original array + indices_list = [] # what's the index of each current character in original array for idx in range(labels.shape[0]): full_tokens = labels.clone()[:, 4 + self.metadata_embedding.max_nb_genres :] total_length, offset, duration = labels[idx, 0], labels[idx, 1], labels[idx, 2] @@ -2073,12 +2073,12 @@ class JukeboxPrior(PreTrainedModel): n_samples (`int`): Number of samples to generate. music_tokens (`List[torch.LongTensor]`, *optional*): - Previously gemerated tokens at the current level. Used as context for the generation. + Previously generated tokens at the current level. Used as context for the generation. music_tokens_conds (`List[torch.FloatTensor]`, *optional*): Upper-level music tokens generated by the previous prior model. Is `None` if the generation is not - conditionned on the upper-level tokens. + conditioned on the upper-level tokens. metadata (`List[torch.LongTensor]`, *optional*): - List containing the metatdata tensor with the artist, genre and the lyric tokens. 
+ List containing the metadata tensor with the artist, genre and the lyric tokens. temp (`float`, *optional*, defaults to 1.0): Sampling temperature. top_k (`int`, *optional*, defaults to 0): @@ -2237,11 +2237,11 @@ class JukeboxPrior(PreTrainedModel): hidden_states (`torch.Tensor`): Hidden states which should be raw audio metadata (`List[torch.LongTensor]`, *optional*): - List containing the metadata conditioning tensorwith the lyric and the metadata tokens. + List containing the metadata conditioning tensor with the lyric and the metadata tokens. decode (`bool`, *optional*, defaults to `False`): Whether or not to decode the encoded to tokens. get_preds (`bool`, *optional*, defaults to `False`): - Whether or not to return the actual predicitons of the model. + Whether or not to return the actual predictions of the model. """ batch_size = hidden_states.shape[0] music_tokens, *music_tokens_conds = self.encode(hidden_states, bs_chunks=batch_size) @@ -2466,10 +2466,10 @@ class JukeboxModel(JukeboxPreTrainedModel): metas (`List[Any]`, *optional*): Metadatas used to generate the `labels` chunk_size (`int`, *optional*, defaults to 32): - Size of a chunk of audio, used to fill up the memory in chuncks to prevent OOM erros. Bigger chunks + Size of a chunk of audio, used to fill up the memory in chunks to prevent OOM errors. Bigger chunks means faster memory filling but more consumption. sampling_temperature (`float`, *optional*, defaults to 0.98): - Temperature used to ajust the randomness of the sampling. + Temperature used to adjust the randomness of the sampling. 
lower_batch_size (`int`, *optional*, defaults to 16): Maximum batch size for the lower level priors max_batch_size (`int`, *optional*, defaults to 16): diff --git a/src/transformers/models/deprecated/mctct/modeling_mctct.py b/src/transformers/models/deprecated/mctct/modeling_mctct.py index 2dd074b28c..e873111cb2 100755 --- a/src/transformers/models/deprecated/mctct/modeling_mctct.py +++ b/src/transformers/models/deprecated/mctct/modeling_mctct.py @@ -467,7 +467,7 @@ class MCTCTPreTrainedModel(PreTrainedModel): def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask): # generate creates 3D attention mask, because of the shape of input_features - # convert it to 2D if thats the case + # convert it to 2D if that's the case if len(attention_mask.shape) > 2: attention_mask = attention_mask[:, :, -1] diff --git a/src/transformers/models/deprecated/mega/modeling_mega.py b/src/transformers/models/deprecated/mega/modeling_mega.py index d5a490b01d..85d1015610 100644 --- a/src/transformers/models/deprecated/mega/modeling_mega.py +++ b/src/transformers/models/deprecated/mega/modeling_mega.py @@ -463,7 +463,7 @@ class MegaMultiDimensionDampedEma(nn.Module): prev_state (`torch.Tensor` of shape `(batch_size, config.ndim)`, *optional*): The hidden state returned from the previous timestep during incremental decoding. use_cache (`bool`, default `False`): - Whether to perfom incremental decoding; uses `prev_state` as the prior timestep, and returns the + Whether to perform incremental decoding; uses `prev_state` as the prior timestep, and returns the updated EMA hidden state for use in the next step Returns: @@ -652,7 +652,7 @@ class MegaGatedCrossAttention(nn.Module): output_attentions (`bool`, defaults to `False`): Whether or not to return the cross-attention weights. 
use_cache (`bool`, defaults to `False`): - Whether to perfom incremental decoding; uses `prev_state` as the prior timestep, and returns the + Whether to perform incremental decoding; uses `prev_state` as the prior timestep, and returns the updated EMA hidden state for use in the next step Returns: @@ -936,7 +936,7 @@ class MegaMovingAverageGatedAttention(nn.Module): output_attentions (`bool`, default `False`): Whether to return self-attention weights use_cache (`bool`, default `False`): - Whether to perfom incremental decoding; uses `past_key_values` as prior state, and returns the updated + Whether to perform incremental decoding; uses `past_key_values` as prior state, and returns the updated states for use in the next step Returns: @@ -1214,7 +1214,7 @@ class MegaBlock(nn.Module): output_attentions (`bool`, default `False`): Whether to return self-attention weights use_cache (`bool`, default `False`): - Whether to perfom incremental decoding; uses `past_key_value` as prior state, and returns the updated + Whether to perform incremental decoding; uses `past_key_value` as prior state, and returns the updated states for use in the next step Returns: diff --git a/src/transformers/models/deprecated/realm/retrieval_realm.py b/src/transformers/models/deprecated/realm/retrieval_realm.py index b3c084f1d2..b5e47abb11 100644 --- a/src/transformers/models/deprecated/realm/retrieval_realm.py +++ b/src/transformers/models/deprecated/realm/retrieval_realm.py @@ -76,7 +76,7 @@ class RealmRetriever: Parameters: block_records (`np.ndarray`): - A numpy array which cantains evidence texts. + A numpy array which contains evidence texts. tokenizer ([`RealmTokenizer`]): The tokenizer to encode retrieved texts. 
""" diff --git a/src/transformers/models/deprecated/realm/tokenization_realm.py b/src/transformers/models/deprecated/realm/tokenization_realm.py index 70e69bc4bc..5c3c7a196f 100644 --- a/src/transformers/models/deprecated/realm/tokenization_realm.py +++ b/src/transformers/models/deprecated/realm/tokenization_realm.py @@ -516,7 +516,7 @@ class WordpieceTokenizer: Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. - For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. Args: text: A single token or whitespace separated tokens. This should have diff --git a/src/transformers/models/deprecated/retribert/tokenization_retribert.py b/src/transformers/models/deprecated/retribert/tokenization_retribert.py index 35a1874aa0..7f2206f0d9 100644 --- a/src/transformers/models/deprecated/retribert/tokenization_retribert.py +++ b/src/transformers/models/deprecated/retribert/tokenization_retribert.py @@ -457,7 +457,7 @@ class WordpieceTokenizer: Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. - For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. Args: text: A single token or whitespace separated tokens. 
This should have diff --git a/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py index 23972deae2..7d82659b5a 100644 --- a/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py +++ b/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py @@ -49,7 +49,7 @@ class TransfoXLConfig(PretrainedConfig): d_inner (`int`, *optional*, defaults to 4096): Inner dimension in FF div_val (`int`, *optional*, defaults to 4): - Divident value for adapative input and softmax + Divident value for adaptive input and softmax pre_lnorm (`boolean`, *optional*, defaults to `False`): Whether or not to apply LayerNorm to the input instead of the output in the blocks. n_layer (`int`, *optional*, defaults to 18): diff --git a/src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py b/src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py index bbbfac9031..6e1c49d708 100644 --- a/src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py +++ b/src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py @@ -46,7 +46,7 @@ class TvltFeatureExtractor(SequenceFeatureExtractor): sampling_rate (`int`, *optional*, defaults to 44100): The sampling rate at which the audio files should be digitalized expressed in Hertz (Hz). hop_length_to_sampling_rate (`int`, *optional*, defaults to 86): - Hop length is length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients. + Hop length is length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients. For example, with sampling rate 44100, the hop length is 512, with 44100 / 512 = 86 n_fft (`int`, *optional*, defaults to 2048): Size of the Fourier transform. 
@@ -141,7 +141,7 @@ class TvltFeatureExtractor(SequenceFeatureExtractor): - For TvltTransformer models, `attention_mask` should alwys be passed for batched inference, to avoid + For TvltTransformer models, `attention_mask` should always be passed for batched inference, to avoid subtle bugs. diff --git a/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py b/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py index 624e52cedd..02d78c9340 100644 --- a/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py +++ b/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py @@ -356,10 +356,10 @@ class TvltImageProcessor(BaseImageProcessor): - **pixel_mask** -- Pixel masks to be fed to a model, of shape (batch_size, num_pixel_patches). - - **pixel_values_mixed** -- Pixel values with both postive or negative to be fed to a model, of shape + - **pixel_values_mixed** -- Pixel values with both positive or negative to be fed to a model, of shape (batch_size, num_channels, height, width). - - **pixel_mask_mixed** -- Pixel masks with both postive or negative to be fed to a model, of shape + - **pixel_mask_mixed** -- Pixel masks with both positive or negative to be fed to a model, of shape (batch_size, num_pixel_patches). 
""" do_resize = do_resize if do_resize is not None else self.do_resize diff --git a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py index cd87217f05..b509d60d12 100644 --- a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py +++ b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py @@ -91,7 +91,7 @@ class ModuleTransfer: for dest_m, src_m in zip(dest_traced, src_traced): dest_m.load_state_dict(src_m.state_dict()) if self.verbose == 1: - print(f"Transfered from={src_m} to={dest_m}") + print(f"Transferred from={src_m} to={dest_m}") def copy_parameters(from_model: nn.Module, our_model: nn.Module) -> nn.Module: diff --git a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py index d43ff7f40d..f07a76b2b2 100644 --- a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py +++ b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py @@ -87,7 +87,7 @@ def create_rename_keys(config): rename_keys.append(("pretrained.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) rename_keys.append(("pretrained.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - # Transfomer encoder + # Transformer encoder for i in range(config.backbone_config.num_hidden_layers): rename_keys.append((f"pretrained.blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1")) rename_keys.append((f"pretrained.blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1")) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index c26bf484f5..d1aa64d2f6 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -902,14 +902,14 @@ class 
DepthProFeatureFusionStage(nn.Module): for _ in range(self.num_layers - 1): self.intermediate.append(DepthProFeatureFusionLayer(config)) - # final layer doesnot require deconvolution + # final layer does not require deconvolution self.final = DepthProFeatureFusionLayer(config, use_deconv=False) def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: if self.num_layers != len(hidden_states): raise ValueError( f"num_layers={self.num_layers} in DepthProFeatureFusionStage" - f"doesnot match len(hidden_states)={len(hidden_states)}" + f"does not match len(hidden_states)={len(hidden_states)}" ) fused_hidden_states = [] diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index cb47f58bda..989d68e75d 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -138,7 +138,7 @@ class DetrObjectDetectionOutput(ModelOutput): possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the unnormalized bounding boxes. auxiliary_outputs (`list[Dict]`, *optional*): - Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and `pred_boxes`) for each decoder layer. 
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -1278,8 +1278,8 @@ class DetrModel(DetrPreTrainedModel): flattened_mask = mask.flatten(1) # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder - # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) - # flattened_mask is a Tensor of shape (batch_size, heigth*width) + # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, height*width) if encoder_outputs is None: encoder_outputs = self.encoder( inputs_embeds=flattened_features, @@ -1603,8 +1603,8 @@ class DetrForSegmentation(DetrPreTrainedModel): flattened_mask = mask.flatten(1) # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder - # flattened_features is a Tensor of shape (batch_size, heigth*width, hidden_size) - # flattened_mask is a Tensor of shape (batch_size, heigth*width) + # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, height*width) if encoder_outputs is None: encoder_outputs = self.detr.model.encoder( inputs_embeds=flattened_features, @@ -1739,7 +1739,7 @@ class DetrMaskHeadSmallConv(nn.Module): nn.init.constant_(m.bias, 0) def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): - # here we concatenate x, the projected feature map, of shape (batch_size, d_model, heigth/32, width/32) with + # here we concatenate x, the projected feature map, of shape (batch_size, d_model, height/32, width/32) with # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32). # We expand the projected feature map to match the number of heads. 
x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) diff --git a/src/transformers/models/distilbert/tokenization_distilbert.py b/src/transformers/models/distilbert/tokenization_distilbert.py index c894211a2e..e5f1a20ae5 100644 --- a/src/transformers/models/distilbert/tokenization_distilbert.py +++ b/src/transformers/models/distilbert/tokenization_distilbert.py @@ -475,7 +475,7 @@ class WordpieceTokenizer: Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. - For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. Args: text: A single token or whitespace separated tokens. This should have diff --git a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py index 367aff7f90..21aa2b4897 100644 --- a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py +++ b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py @@ -123,7 +123,7 @@ def create_rename_keys_backbone(config): rename_keys.append(("patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) rename_keys.append(("patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - # Transfomer encoder + # Transformer encoder for i in range(config.backbone_config.num_hidden_layers): # layernorms rename_keys.append((f"blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight")) diff --git a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py b/src/transformers/models/dpt/convert_dpt_beit_to_hf.py index 3a576d772f..c4ff8a3eb7 100644 --- a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py +++ b/src/transformers/models/dpt/convert_dpt_beit_to_hf.py @@ -77,7 +77,7 @@ def create_rename_keys(config): 
rename_keys.append(("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) rename_keys.append(("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - # Transfomer encoder + # Transformer encoder for i in range(config.backbone_config.num_hidden_layers): rename_keys.append((f"pretrained.model.blocks.{i}.gamma_1", f"backbone.encoder.layer.{i}.lambda_1")) rename_keys.append((f"pretrained.model.blocks.{i}.gamma_2", f"backbone.encoder.layer.{i}.lambda_2")) diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 095cd1a48b..9a35ee4b4a 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -111,22 +111,22 @@ class DPTImageProcessor(BaseImageProcessor): Args: do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions. Can be overidden by `do_resize` in `preprocess`. + Whether to resize the image's (height, width) dimensions. Can be overridden by `do_resize` in `preprocess`. size (`Dict[str, int]` *optional*, defaults to `{"height": 384, "width": 384}`): - Size of the image after resizing. Can be overidden by `size` in `preprocess`. + Size of the image after resizing. Can be overridden by `size` in `preprocess`. resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Defines the resampling filter to use if resizing the image. Can be overidden by `resample` in `preprocess`. + Defines the resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. keep_aspect_ratio (`bool`, *optional*, defaults to `False`): If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can - be overidden by `keep_aspect_ratio` in `preprocess`. + be overridden by `keep_aspect_ratio` in `preprocess`. 
ensure_multiple_of (`int`, *optional*, defaults to 1): - If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overidden + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden by `ensure_multiple_of` in `preprocess`. do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overidden by `do_rescale` in + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in `preprocess`. rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. Can be overidden by `rescale_factor` in `preprocess`. + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in `preprocess`. do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` method. diff --git a/src/transformers/models/efficientnet/modeling_efficientnet.py b/src/transformers/models/efficientnet/modeling_efficientnet.py index 9e0b890729..7943b5d117 100644 --- a/src/transformers/models/efficientnet/modeling_efficientnet.py +++ b/src/transformers/models/efficientnet/modeling_efficientnet.py @@ -383,7 +383,7 @@ class EfficientNetBlock(nn.Module): class EfficientNetEncoder(nn.Module): r""" - Forward propogates the embeddings through each EfficientNet block. + Forward propagates the embeddings through each EfficientNet block. 
Args: config ([`EfficientNetConfig`]): diff --git a/src/transformers/models/electra/modeling_flax_electra.py b/src/transformers/models/electra/modeling_flax_electra.py index 77a445e6cc..4bf75ff33e 100644 --- a/src/transformers/models/electra/modeling_flax_electra.py +++ b/src/transformers/models/electra/modeling_flax_electra.py @@ -1237,7 +1237,7 @@ class FlaxElectraSequenceSummary(nn.Module): Returns: `jnp.ndarray`: The summary of the sequence hidden states. """ - # NOTE: this doest "first" type summary always + # NOTE: this does "first" type summary always output = hidden_states[:, 0] output = self.first_dropout(output, deterministic=deterministic) output = self.summary(output) diff --git a/src/transformers/models/electra/tokenization_electra.py b/src/transformers/models/electra/tokenization_electra.py index 3b21527e6c..365274d7ed 100644 --- a/src/transformers/models/electra/tokenization_electra.py +++ b/src/transformers/models/electra/tokenization_electra.py @@ -464,7 +464,7 @@ class WordpieceTokenizer: Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. - For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. Args: text: A single token or whitespace separated tokens. 
This should have diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py index f5a626a21f..d2d0b56a8d 100644 --- a/src/transformers/models/emu3/modeling_emu3.py +++ b/src/transformers/models/emu3/modeling_emu3.py @@ -1780,7 +1780,7 @@ EMU3_INPUTS_DOCSTRING = r""" class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin): _tied_weights_keys = ["text_model.lm_head.weight"] - _supports_static_cache = False # `get_image_tokens()`, called when `pixel_values` is passed, is not compileable + _supports_static_cache = False # `get_image_tokens()`, called when `pixel_values` is passed, is not compilable def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/emu3/modular_emu3.py b/src/transformers/models/emu3/modular_emu3.py index c4e35e71d2..52d32dbdee 100644 --- a/src/transformers/models/emu3/modular_emu3.py +++ b/src/transformers/models/emu3/modular_emu3.py @@ -1123,7 +1123,7 @@ class Emu3ForCausalLM(LlamaForCausalLM, Emu3PreTrainedModel, GenerationMixin): class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin): _tied_weights_keys = ["text_model.lm_head.weight"] - _supports_static_cache = False # `get_image_tokens()`, called when `pixel_values` is passed, is not compileable + _supports_static_cache = False # `get_image_tokens()`, called when `pixel_values` is passed, is not compilable def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/encodec/configuration_encodec.py b/src/transformers/models/encodec/configuration_encodec.py index 77fd67727d..c00b5ebd1c 100644 --- a/src/transformers/models/encodec/configuration_encodec.py +++ b/src/transformers/models/encodec/configuration_encodec.py @@ -38,7 +38,7 @@ class EncodecConfig(PretrainedConfig): Args: target_bandwidths (`List[float]`, *optional*, defaults to `[1.5, 3.0, 6.0, 12.0, 24.0]`): - The range of diffent bandwiths the model can encode audio with. 
+ The range of different bandwidths the model can encode audio with. sampling_rate (`int`, *optional*, defaults to 24000): The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz). audio_channels (`int`, *optional*, defaults to 1): diff --git a/src/transformers/models/encodec/modeling_encodec.py b/src/transformers/models/encodec/modeling_encodec.py index 670ac99e03..ba699d745e 100644 --- a/src/transformers/models/encodec/modeling_encodec.py +++ b/src/transformers/models/encodec/modeling_encodec.py @@ -503,7 +503,7 @@ ENCODEC_START_DOCSTRING = r""" ENCODEC_INPUTS_DOCSTRING = r""" Args: input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`, *optional*): - Raw audio input converted to Float and padded to the approriate length in order to be encoded using chunks + Raw audio input converted to Float and padded to the appropriate length in order to be encoded using chunks of length self.chunk_length and a stride of `config.chunk_stride`. padding_mask (`torch.BoolTensor` of shape `(batch_size, channels, sequence_length)`, *optional*): Mask to avoid computing scaling factors on padding token indices (can we avoid computing conv on these+). 
diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py index a5eff83e55..af57b2596c 100644 --- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py @@ -78,7 +78,7 @@ class EncoderDecoderConfig(PretrainedConfig): super().__init__(**kwargs) if "encoder" not in kwargs or "decoder" not in kwargs: raise ValueError( - f"A configuraton of type {self.model_type} cannot be instantiated because " + f"A configuration of type {self.model_type} cannot be instantiated because " f"both `encoder` and `decoder` sub-configurations were not passed, only {kwargs}" ) encoder_config = kwargs.pop("encoder") diff --git a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py index bdc589484c..ccb0aa0a6d 100644 --- a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py @@ -784,7 +784,7 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel): [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. model_args (remaining positional arguments, *optional*): - All remaning positional arguments will be passed to the underlying model's `__init__` method. + All remaining positional arguments will be passed to the underlying model's `__init__` method. 
kwargs (remaining dictionary of keyword arguments, *optional*): Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py index a5abafc361..9926f8d10f 100644 --- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py @@ -341,7 +341,7 @@ class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss): `decoder_from_pt` should be set to `True`. model_args (remaining positional arguments, *optional*): - All remaning positional arguments will be passed to the underlying model's `__init__` method. + All remaining positional arguments will be passed to the underlying model's `__init__` method. kwargs (remaining dictionary of keyword arguments, *optional*): Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index 74076eddf2..8526771d79 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -604,7 +604,7 @@ class FlavaLayer(nn.Module): self.intermediate = FlavaIntermediate(config) self.output = FlavaOutput(config) - # TODO: Check fp32 layer norm possiblity + # TODO: Check fp32 layer norm possibility self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) diff --git a/src/transformers/models/funnel/tokenization_funnel.py b/src/transformers/models/funnel/tokenization_funnel.py index 8cb6f1af0e..d1c624a88d 100644 --- a/src/transformers/models/funnel/tokenization_funnel.py +++ b/src/transformers/models/funnel/tokenization_funnel.py @@ -495,7 
+495,7 @@ class WordpieceTokenizer: Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. - For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. Args: text: A single token or whitespace separated tokens. This should have diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 78a40f6d53..314fba427a 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -329,7 +329,7 @@ class GPT2Attention(nn.Module): else: # Attention functions are consistent with previous equivalent attention classes, however they do not support some options # (e.g. layer scaling, head mask) that eager supports. These implementations are thus equivalent to previous code, but - # not necessarily to eager (if mentionned options are provided). + # not necessarily to eager (if mentioned options are provided). attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] if using_eager and self.reorder_and_upcast_attn: diff --git a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py index 19b0fd2375..194fff7dd6 100644 --- a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py @@ -195,7 +195,7 @@ class GPTNeoXJapaneseTokenizer(PreTrainedTokenizer): class SubWordJapaneseTokenizer: """ - https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT Lisence according to the + https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT License according to the original repository. 
MIT License diff --git a/src/transformers/models/granitemoe/configuration_granitemoe.py b/src/transformers/models/granitemoe/configuration_granitemoe.py index 9ef029f95c..79f3d72cc8 100644 --- a/src/transformers/models/granitemoe/configuration_granitemoe.py +++ b/src/transformers/models/granitemoe/configuration_granitemoe.py @@ -97,9 +97,9 @@ class GraniteMoeConfig(PretrainedConfig): num_local_experts (`int`, *optional*, defaults to 8): total number of experts num_experts_per_tok (`int`, *optional*, defaults to 2): number of experts per token output_router_logits (`bool`, *optional*, defaults to `False`): - Whether or not the router logits should be returned by the model. Enabeling this will also + Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss. - router_aux_loss_coef (`float`, *optional*, defaults to 0.001): router auxialiary loss coefficient + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): router auxiliary loss coefficient ```python >>> from transformers import GraniteMoeModel, GraniteMoeConfig diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index d417535db7..26496f7d0e 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -237,11 +237,12 @@ class GraniteMoeParallelExperts(nn.Module): def __init__(self, num_experts: int, input_size: int, output_size: int) -> None: """ Initialize the GraniteMoeParallelExperts module. - The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's comptible with + The experts weights are stored in [num_experts, output_size, input_size] format. 
Such that it's compatible with many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py) used in vllm. + Args: num_experts (int): Number of experts. @@ -259,11 +260,13 @@ class GraniteMoeParallelExperts(nn.Module): def forward(self, inputs, expert_size): """ Forward pass of the GraniteMoeParallelExperts module. + Args: inputs (Tensor): Input tensor. expert_size: Expert size information. + Returns: Tensor: Output tensor. """ diff --git a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py index 49df8e0bdf..32b55c69f3 100644 --- a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py @@ -97,9 +97,9 @@ class GraniteMoeSharedConfig(PretrainedConfig): num_local_experts (`int`, *optional*, defaults to 8): total number of experts num_experts_per_tok (`int`, *optional*, defaults to 2): number of experts per token output_router_logits (`bool`, *optional*, defaults to `False`): - Whether or not the router logits should be returned by the model. Enabeling this will also + Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss. - router_aux_loss_coef (`float`, *optional*, defaults to 0.001): router auxialiary loss coefficient + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): router auxiliary loss coefficient shared_intermediate_size (`int`, *optional*, defaults to 0): intermediate size for shared experts. 0 implies no shared experts. 
diff --git a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py index 59d679fb3c..2ca60e007b 100644 --- a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py @@ -105,11 +105,12 @@ class GraniteMoeSharedParallelExperts(nn.Module): def __init__(self, num_experts: int, input_size: int, output_size: int) -> None: """ Initialize the GraniteMoeSharedParallelExperts module. - The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's comptible with + The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py) used in vllm. + Args: num_experts (int): Number of experts. @@ -127,11 +128,13 @@ class GraniteMoeSharedParallelExperts(nn.Module): def forward(self, inputs, expert_size): """ Forward pass of the GraniteMoeSharedParallelExperts module. + Args: inputs (Tensor): Input tensor. expert_size: Expert size information. + Returns: Tensor: Output tensor. """ diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index a238c1dc1d..6d2a2a8df2 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -262,7 +262,7 @@ class GroundingDinoObjectDetectionOutput(ModelOutput): possible padding). You can use [`~GroundingDinoProcessor.post_process_grounded_object_detection`] to retrieve the unnormalized bounding boxes. 
auxiliary_outputs (`List[Dict]`, *optional*): - Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and `pred_boxes`) for each decoder layer. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): @@ -2098,9 +2098,9 @@ class GroundingDinoModel(GroundingDinoPreTrainedModel): _, height, width = mask.shape valid_height = torch.sum(mask[:, :, 0], 1) valid_width = torch.sum(mask[:, 0, :], 1) - valid_ratio_heigth = valid_height.float() / height + valid_ratio_height = valid_height.float() / height valid_ratio_width = valid_width.float() / width - valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1) + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1) return valid_ratio def generate_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): @@ -2136,8 +2136,8 @@ class GroundingDinoModel(GroundingDinoPreTrainedModel): scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale - width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) - proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) + width_height = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4) proposals.append(proposal) current_position += height * width diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py index 2d0d2af79e..36d41bfc57 100644 --- a/src/transformers/models/hubert/configuration_hubert.py +++ b/src/transformers/models/hubert/configuration_hubert.py @@ -106,7 +106,7 @@ class 
HubertConfig(PretrainedConfig): Recognition](https://arxiv.org/abs/1904.08779). mask_time_prob (`float`, *optional*, defaults to 0.05): Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking - procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If + procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector span to be masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`. @@ -118,7 +118,7 @@ class HubertConfig(PretrainedConfig): mask_time_min_masks'' mask_feature_prob (`float`, *optional*, defaults to 0.0): Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The - masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over + masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap may decrease the actual percentage of masked vectors. 
This is only relevant if `apply_spec_augment is diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 9b7d260300..c243ebde9e 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -199,7 +199,7 @@ def freeze_model(model, module_exceptions=[]): module_exceptions_mapped = [mapping[m] for m in module_exceptions] for module in model.modules(): if module_exceptions and any(isinstance(module, t) for t in module_exceptions_mapped): - module.requires_grad_(True) # Explicitely setting it to true to avoid any mistakes + module.requires_grad_(True) # Explicitly setting it to true to avoid any mistakes else: module.requires_grad_(False) return model @@ -1235,7 +1235,7 @@ class IdeficsModel(IdeficsPreTrainedModel): image_attention_mask = None # cross_attention_gate: - # For any tokens attending to no images, the hidden_states comming out of the cross-attention should be zeroed-out. + # For any tokens attending to no images, the hidden_states coming out of the cross-attention should be zeroed-out. # `image_attention_mask` has shape [bsz, 1, num_images, hidden_size] with elements equal to either 0.0 or a very negative number. # If any of the elements are 0.0, then the token is attending to at least one image and the gate value is 1. Otherwise the gate value is 0. # `cross_attention_gate` has shape [bsz, seq_len] with elements equal to either 0.0 or 1.0. diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index e67a0845a7..37876080df 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -318,7 +318,7 @@ class IdeficsProcessor(ProcessorMixin): and the two images will be massaged using [`IdeficsImageProcessor.__call__`] method and placed inside the `pixel_values` dict entry of the return value. 
- This example also examplifies that images can be passed as objects or as text urls. It can be seen that the + This example also exemplifies that images can be passed as objects or as text urls. It can be seen that the first image is passed as object and the second one as a url. To do training do: diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 64d939e7b4..b72b6f0b53 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -598,7 +598,7 @@ class JambaMambaMixer(nn.Module): # projection of the input hidden states self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=self.use_bias) - # selective projection used to make dt, B and C input dependant + # selective projection used to make dt, B and C input dependent self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False) # time step projection (discretization) self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True) @@ -1547,7 +1547,7 @@ class JambaForCausalLM(JambaPreTrainedModel, GenerationMixin): use_cache=True, **kwargs, ): - # Overwitten -- has a unique cache type, `HybridMambaAttentionDynamicCache` + # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache` empty_past_kv = past_key_values is None diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py index 4053c900bf..5846ea369d 100644 --- a/src/transformers/models/jetmoe/configuration_jetmoe.py +++ b/src/transformers/models/jetmoe/configuration_jetmoe.py @@ -57,7 +57,7 @@ class JetMoeConfig(PretrainedConfig): num_experts_per_tok (`int, *optional*, defaults to 2): The number of experts to route per-token and for MoE and MoA. output_router_logits (`bool`, *optional*, defaults to `False`): - Whether or not the router logits should be returned by the model. 
Enabeling this will also + Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss. aux_loss_coef (`float`, *optional*, defaults to 0.01): The coefficient for the auxiliary loss. diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index cabebb90ef..180f90676b 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -147,7 +147,7 @@ class JetMoeParallelExperts(nn.Module): def __init__(self, num_experts: int, input_size: int, output_size: int) -> None: """ Initialize the JetMoeParallelExperts module. - The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's comptible with + The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py) diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm.py b/src/transformers/models/layoutlm/tokenization_layoutlm.py index a0b61c93ac..135ca2f68a 100644 --- a/src/transformers/models/layoutlm/tokenization_layoutlm.py +++ b/src/transformers/models/layoutlm/tokenization_layoutlm.py @@ -465,7 +465,7 @@ class WordpieceTokenizer: Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. - For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. Args: text: A single token or whitespace separated tokens. 
This should have diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py index d324c1ac7d..1fa23c32ae 100644 --- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py @@ -1521,7 +1521,7 @@ class WordpieceTokenizer: Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. - For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. Args: text: A single token or whitespace separated tokens. This should have diff --git a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py index 8c79ae42f0..d55a9d9b0e 100644 --- a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py @@ -448,7 +448,7 @@ class LayoutLMv3SelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - # Use the trick of the CogView paper to stablize training + # Use the trick of the CogView paper to stabilize training attention_probs = self.cogview_attention(attention_scores) # This is actually dropping out entire tokens to attend to, which might diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index 9f80075338..be56ef1bb1 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -2467,7 +2467,7 @@ class LEDForSequenceClassification(LEDPreTrainedModel): def __init__(self, config: LEDConfig, **kwargs): warnings.warn( "The `transformers.LEDForSequenceClassification` class is deprecated and will be removed in version 5 of" - " Transformers. 
No actual method were provided in the original paper on how to perfom" + " Transformers. No actual method were provided in the original paper on how to perform" " sequence classification.", FutureWarning, ) diff --git a/src/transformers/models/llama4/convert_llama4_weights_to_hf.py b/src/transformers/models/llama4/convert_llama4_weights_to_hf.py index 75b0ab8363..923d9ffc63 100644 --- a/src/transformers/models/llama4/convert_llama4_weights_to_hf.py +++ b/src/transformers/models/llama4/convert_llama4_weights_to_hf.py @@ -239,7 +239,7 @@ def write_model( config_kwargs = {} if params["use_scaled_rope"]: - # some constans from original code + # some constants from original code rope_scaling = { "rope_type": "llama3", "factor": 8.0, @@ -288,7 +288,7 @@ def write_model( for_llm_compressor=_OFFLINE_QUANT_COMPATIBLE, **config_kwargs, ) - # default vision config frmo params + # default vision config from params vision_params = params["vision_args"] vision_dim = vision_params["dim"] diff --git a/src/transformers/models/llama4/modeling_llama4.py b/src/transformers/models/llama4/modeling_llama4.py index d1611a2fe0..0959199c2e 100644 --- a/src/transformers/models/llama4/modeling_llama4.py +++ b/src/transformers/models/llama4/modeling_llama4.py @@ -857,7 +857,7 @@ class Llama4TextModel(Llama4PreTrainedModel): '?' : 5 ⬚ ⬚ ⬚ ■ ■ ■ | If the chunk size is 3. - This can just be appplied over the already created attention mask + This can just be applied over the already created attention mask """ arange_vector = torch.arange(start, end, device=device) block_pos = torch.abs( @@ -894,7 +894,7 @@ class Llama4TextModel(Llama4PreTrainedModel): dtype (`torch.dtype`): The dtype to use for the 4D attention mask. device (`torch.device`): - The device to plcae the 4D attention mask on. + The device to place the 4D attention mask on. cache_position (`torch.Tensor`): Indices depicting the position of the input sequence tokens in the sequence. 
batch_size (`torch.Tensor`): diff --git a/src/transformers/models/llava/convert_llava_weights_to_hf.py b/src/transformers/models/llava/convert_llava_weights_to_hf.py index dafbf8bf2f..33dbe37d58 100644 --- a/src/transformers/models/llava/convert_llava_weights_to_hf.py +++ b/src/transformers/models/llava/convert_llava_weights_to_hf.py @@ -72,7 +72,7 @@ def load_original_state_dict(model_id): for key in f.keys(): original_state_dict[key] = f.get_tensor(key) - # tied wieghts so lm.head is not saved. Let's clone to load state dict + # tied weights so lm.head is not saved. Let's clone to load state dict if "lm_head.weight" not in original_state_dict: original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone() @@ -127,7 +127,7 @@ def convert_llava_llama_to_hf(text_model_id, vision_model_id, output_hub_path, o vision_config=vision_config, ) - # llms-lab interleeave models do not use any selection startegy except for last hidden state + # llms-lab interleave models do not use any selection strategy except for last hidden state if "Qwen" in text_model_id: config.image_token_id = 151646 if "siglip" in vision_model_id: diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index 72a61bff71..be98c0a3e5 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -53,7 +53,7 @@ class LlavaProcessor(ProcessorMixin): Patch size from the vision tower. vision_feature_select_strategy (`str`, *optional*): The feature selection strategy used to select the vision feature from the vision backbone. - Shoudl be same as in model's config + Should be same as in model's config chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. 
image_token (`str`, *optional*, defaults to `""`): diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index c212a549fc..63246e8a53 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -688,7 +688,7 @@ class LlavaNextImageProcessor(BaseImageProcessor): image_sizes = [get_image_size(image, channel_dim=input_data_format) for image in images] for image in images: # convert image into a list of patches - # we intentially use the same data format as the input data format + # we intentionally use the same data format as the input data format image_patches = self.get_image_patches( image, image_grid_pinpoints, diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index 78175adc21..e1409b5d1d 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -56,7 +56,7 @@ class LlavaNextProcessor(ProcessorMixin): Patch size from the vision tower. vision_feature_select_strategy (`str`, *optional*): The feature selection strategy used to select the vision feature from the vision backbone. - Shoudl be same as in model's config + Should be same as in model's config chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. 
image_token (`str`, *optional*, defaults to `""`): diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index c7ff0a1d7a..12ba442175 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -64,7 +64,7 @@ class LlavaNextVideoProcessor(ProcessorMixin): Patch size from the vision tower. vision_feature_select_strategy (`str`, *optional*): The feature selection strategy used to select the vision feature from the vision backbone. - Shoudl be same as in model's config + Should be same as in model's config video_token (`str`, *optional*, defaults to `"