From 44715225e3169a5b57645c3a81f8d791cf67154b Mon Sep 17 00:00:00 2001 From: Afanti Date: Thu, 27 Mar 2025 00:09:48 +0800 Subject: [PATCH] fix typos in the code comments and error messages (#36993) * chore: enhance code comments * chore: enhance code comments * chore: enhance code comments * chore: enhance code comments * chore: enhance code comments * chore: enhance code comments * chore: enhance code comments --- src/transformers/models/altclip/modeling_altclip.py | 2 +- src/transformers/models/aria/modeling_aria.py | 2 +- src/transformers/models/aria/modular_aria.py | 2 +- src/transformers/models/clip/modeling_clip.py | 2 +- src/transformers/models/clipseg/modeling_clipseg.py | 2 +- src/transformers/models/esm/modeling_esmfold.py | 2 +- src/transformers/models/esm/openfold_utils/rigid_utils.py | 8 ++++---- .../models/falcon_mamba/configuration_falcon_mamba.py | 2 +- .../models/falcon_mamba/modeling_falcon_mamba.py | 4 ++-- src/transformers/models/git/modeling_git.py | 2 +- src/transformers/models/idefics/vision.py | 2 +- src/transformers/models/kosmos2/modeling_kosmos2.py | 2 +- src/transformers/models/longt5/modeling_longt5.py | 2 +- src/transformers/models/mamba/modeling_mamba.py | 4 ++-- src/transformers/models/mt5/modeling_mt5.py | 2 +- src/transformers/models/pix2struct/modeling_pix2struct.py | 2 +- src/transformers/models/pop2piano/modeling_pop2piano.py | 2 +- ...ch_transformers_original_flax_checkpoint_to_pytorch.py | 2 +- .../switch_transformers/modeling_switch_transformers.py | 6 +++--- .../models/t5/convert_t5x_checkpoint_to_flax.py | 2 +- src/transformers/models/t5/modeling_t5.py | 2 +- src/transformers/models/udop/modeling_udop.py | 2 +- src/transformers/models/umt5/modeling_umt5.py | 2 +- .../models/vivit/convert_vivit_flax_to_pytorch.py | 2 +- src/transformers/models/x_clip/modeling_x_clip.py | 2 +- 25 files changed, 32 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 8a6c845efa..90d8aa631e 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -798,7 +798,7 @@ class AltCLIPAttention(nn.Module): attn_weights = nn.functional.softmax(attn_weights, dim=-1) if output_attentions: - # this operation is a bit akward, but it's required to + # this operation is a bit awkward, but it's required to # make sure that attn_weights keeps its gradient. # In order to do so, attn_weights have to reshaped # twice and have to be reused in the following diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index e44cd709df..707eb66c42 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -267,7 +267,7 @@ def sequential_experts_gemm(token_states, expert_weights, tokens_per_expert): output = torch.zeros(num_tokens, out_features, dtype=token_states.dtype, device=token_states.device) cumsum_num_tokens = torch.cumsum(tokens_per_expert, dim=0) - # Insert zero at the begining for offset index's convenience + # Insert zero at the beginning for offset index's convenience zero_tensor = torch.zeros(1, dtype=torch.long, device=cumsum_num_tokens.device) cumsum_num_tokens = torch.cat((zero_tensor, cumsum_num_tokens)) diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index bf9d864a4c..d341970e29 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -86,7 +86,7 @@ def sequential_experts_gemm(token_states, expert_weights, tokens_per_expert): output = torch.zeros(num_tokens, out_features, dtype=token_states.dtype, device=token_states.device) cumsum_num_tokens = torch.cumsum(tokens_per_expert, dim=0) - # Insert zero at the begining for offset index's convenience + # Insert zero at the beginning for offset index's convenience zero_tensor = torch.zeros(1, dtype=torch.long, device=cumsum_num_tokens.device) cumsum_num_tokens = torch.cat((zero_tensor, cumsum_num_tokens)) diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 7898a125fa..c13d9097e6 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -373,7 +373,7 @@ class CLIPAttention(nn.Module): attn_weights = nn.functional.softmax(attn_weights, dim=-1) if output_attentions: - # this operation is a bit akward, but it's required to + # this operation is a bit awkward, but it's required to # make sure that attn_weights keeps its gradient. # In order to do so, attn_weights have to reshaped # twice and have to be reused in the following diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 3f363dd51f..b806eea5e6 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -341,7 +341,7 @@ class CLIPSegAttention(nn.Module): attn_weights = nn.functional.softmax(attn_weights, dim=-1) if output_attentions: - # this operation is a bit akward, but it's required to + # this operation is a bit awkward, but it's required to # make sure that attn_weights keeps its gradient. # In order to do so, attn_weights have to reshaped # twice and have to be reused in the following diff --git a/src/transformers/models/esm/modeling_esmfold.py b/src/transformers/models/esm/modeling_esmfold.py index 67cee99294..d06f9a7d43 100644 --- a/src/transformers/models/esm/modeling_esmfold.py +++ b/src/transformers/models/esm/modeling_esmfold.py @@ -1016,7 +1016,7 @@ class EsmFoldSelfAttention(nn.Module): use mask. Inputs: - x: batch of input sequneces (.. x L x C) mask: batch of boolean masks where 1=valid, 0=padding position (.. + x: batch of input sequences (.. x L x C) mask: batch of boolean masks where 1=valid, 0=padding position (.. x L_k) bias: batch of scalar pairwise attention biases (.. x Lq x Lk x num_heads) Outputs: diff --git a/src/transformers/models/esm/openfold_utils/rigid_utils.py b/src/transformers/models/esm/openfold_utils/rigid_utils.py index 08f5ce0a4f..4d0f2f69b3 100644 --- a/src/transformers/models/esm/openfold_utils/rigid_utils.py +++ b/src/transformers/models/esm/openfold_utils/rigid_utils.py @@ -989,10 +989,10 @@ class Rigid: def to_tensor_4x4(self) -> torch.Tensor: """ - Converts a transformation to a homogenous transformation tensor. + Converts a transformation to a homogeneous transformation tensor. Returns: - A [*, 4, 4] homogenous transformation tensor + A [*, 4, 4] homogeneous transformation tensor """ tensor = self._trans.new_zeros((*self.shape, 4, 4)) tensor[..., :3, :3] = self._rots.get_rot_mats() @@ -1003,10 +1003,10 @@ class Rigid: @staticmethod def from_tensor_4x4(t: torch.Tensor) -> Rigid: """ - Constructs a transformation from a homogenous transformation tensor. + Constructs a transformation from a homogeneous transformation tensor. Args: - t: [*, 4, 4] homogenous transformation tensor + t: [*, 4, 4] homogeneous transformation tensor Returns: T object with shape [*] """ diff --git a/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py b/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py index 4127551445..4099920f40 100644 --- a/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py @@ -80,7 +80,7 @@ class FalconMambaConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the cache should be used. use_mambapy (`bool`, *optional*, defaults to `False`): - Determines the fallback strategy during training if the CUDA-based official implementation of FalconMamba is not avaiable. If `True`, the falcon_mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited. + Determines the fallback strategy during training if the CUDA-based official implementation of FalconMamba is not available. If `True`, the falcon_mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited. mixer_rms_eps (`float`, *optional*, defaults to 1e-06): The RMS norm epsilon value that is used in the Mixer RMS norm for B, C and dt states. Example: diff --git a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py index d7a40ed5c5..bf8695d2a9 100644 --- a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -119,7 +119,7 @@ class FalconMambaMixer(nn.Module): # projection of the input hidden states self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias) - # selective projection used to make dt, B and C input dependant + # selective projection used to make dt, B and C input dependent self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False) # time step projection (discretization) self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True) @@ -768,7 +768,7 @@ class FalconMambaForCausalLM(FalconMambaPreTrainedModel, GenerationMixin): attention_mask: Optional[torch.LongTensor] = None, **kwargs, ): - # Overwitten -- uses `cache_params` as opposed to `past_key_values` + # Overwritten -- uses `cache_params` as opposed to `past_key_values` if use_cache: # `cache_position` should have been initialized in `generate` diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 232370ee01..0d37d04e81 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -791,7 +791,7 @@ class GitVisionAttention(nn.Module): attn_weights = nn.functional.softmax(attn_weights, dim=-1) if output_attentions: - # this operation is a bit akward, but it's required to + # this operation is a bit awkward, but it's required to # make sure that attn_weights keeps its gradient. # In order to do so, attn_weights have to reshaped # twice and have to be reused in the following diff --git a/src/transformers/models/idefics/vision.py b/src/transformers/models/idefics/vision.py index 5339b70692..c01591b5a6 100644 --- a/src/transformers/models/idefics/vision.py +++ b/src/transformers/models/idefics/vision.py @@ -237,7 +237,7 @@ class IdeficsVisionAttention(nn.Module): attn_weights = nn.functional.softmax(attn_weights, dim=-1) if output_attentions: - # this operation is a bit akward, but it's required to + # this operation is a bit awkward, but it's required to # make sure that attn_weights keeps its gradient. # In order to do so, attn_weights have to reshaped # twice and have to be reused in the following diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index c16aab776f..b74f060ced 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -543,7 +543,7 @@ class Kosmos2VisionAttention(nn.Module): attn_weights = nn.functional.softmax(attn_weights, dim=-1) if output_attentions: - # this operation is a bit akward, but it's required to + # this operation is a bit awkward, but it's required to # make sure that attn_weights keeps its gradient. # In order to do so, attn_weights have to reshaped # twice and have to be reused in the following diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index 70ec2db49f..d076926406 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -235,7 +235,7 @@ class LongT5LayerNorm(nn.Module): def forward(self, hidden_states): # LongT5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for # half-precision inputs is done in fp32 diff --git a/src/transformers/models/mamba/modeling_mamba.py b/src/transformers/models/mamba/modeling_mamba.py index 960da99890..c5389cdfd4 100644 --- a/src/transformers/models/mamba/modeling_mamba.py +++ b/src/transformers/models/mamba/modeling_mamba.py @@ -98,7 +98,7 @@ class MambaMixer(nn.Module): # projection of the input hidden states self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias) - # selective projection used to make dt, B and C input dependant + # selective projection used to make dt, B and C input dependent self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False) # time step projection (discretization) self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True) @@ -708,7 +708,7 @@ class MambaForCausalLM(MambaPreTrainedModel, GenerationMixin): attention_mask: Optional[torch.LongTensor] = None, **kwargs, ): - # Overwitten -- uses `cache_params` as opposed to `past_key_values` + # Overwritten -- uses `cache_params` as opposed to `past_key_values` if use_cache: # `cache_position` should have been initialized in `generate` diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index 30938c8251..2df643b781 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -135,7 +135,7 @@ class MT5LayerNorm(nn.Module): def forward(self, hidden_states): # MT5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for # half-precision inputs is done in fp32 diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index 63c392db12..675859f310 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -72,7 +72,7 @@ class Pix2StructLayerNorm(nn.Module): def forward(self, hidden_states): # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for # half-precision inputs is done in fp32 diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py index cadf71871e..2e4734aed8 100644 --- a/src/transformers/models/pop2piano/modeling_pop2piano.py +++ b/src/transformers/models/pop2piano/modeling_pop2piano.py @@ -164,7 +164,7 @@ class Pop2PianoLayerNorm(nn.Module): def forward(self, hidden_states): # Pop2Piano uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for # half-precision inputs is done in fp32 diff --git a/src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py b/src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py index 5937101169..71d304ea96 100644 --- a/src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py +++ b/src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py @@ -119,7 +119,7 @@ GIN_TO_CONFIG_MAPPING = { def convert_gin_to_config(gin_file, num_experts): - # Convert a google style config to the hugging face fromat + # Convert a google style config to the hugging face format import regex as re with open(gin_file, "r") as f: diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index 73282a1509..d2d9929b91 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -230,7 +230,7 @@ class SwitchTransformersLayerNorm(nn.Module): def forward(self, hidden_states): # SwitchTransformers uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for # half-precision inputs is done in fp32 @@ -297,12 +297,12 @@ class SwitchTransformersSparseMLP(nn.Module): expert the corresponding hidden states. """ - # Step 1: Get the router_mask from the router as wel as the probabilities + # Step 1: Get the router_mask from the router as well as the probabilities router_mask, router_probs, router_logits = self.router(hidden_states) expert_index = torch.argmax(router_mask, dim=-1) # The routers introduced might not always map all the tokens, to a router, which means that some hidden states - # can be unchanged from one layer to another. That is why the hidden states are cloned before updating only the seleced ones. + # can be unchanged from one layer to another. That is why the hidden states are cloned before updating only the selected ones. next_states = hidden_states.clone() diff --git a/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py b/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py index 91ac9f08a0..12498359d2 100644 --- a/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py +++ b/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py @@ -218,7 +218,7 @@ def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name, flax_dump_f flax_model.params["lm_head"]["kernel"] = t5x_model["target"]["decoder"]["logits_dense"]["kernel"] flax_model.save_pretrained(flax_dump_folder_path) - print("T5X Model was sucessfully converted!") + print("T5X Model was successfully converted!") if __name__ == "__main__": diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 0851c0eac9..39c8101f92 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -249,7 +249,7 @@ class T5LayerNorm(nn.Module): def forward(self, hidden_states): # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for # half-precision inputs is done in fp32 diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index 93d128562e..8238cd38a9 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -524,7 +524,7 @@ class UdopLayerNorm(nn.Module): def forward(self, hidden_states): # Udop uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for # half-precision inputs is done in fp32 diff --git a/src/transformers/models/umt5/modeling_umt5.py b/src/transformers/models/umt5/modeling_umt5.py index 07c44bef7b..8c22500a7c 100644 --- a/src/transformers/models/umt5/modeling_umt5.py +++ b/src/transformers/models/umt5/modeling_umt5.py @@ -73,7 +73,7 @@ class UMT5LayerNorm(nn.Module): def forward(self, hidden_states): # UMT5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for # half-precision inputs is done in fp32 diff --git a/src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py b/src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py index c3075d6034..f4b5e1cfda 100644 --- a/src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py +++ b/src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py @@ -73,7 +73,7 @@ def transform_attention(current: np.ndarray): return transform_attention_kernel(current) else: - raise Exception(f"Invalid number of dimesions: {np.ndim(current)}") + raise Exception(f"Invalid number of dimensions: {np.ndim(current)}") def transform_attention_bias(current: np.ndarray): diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 4f1aa6e322..2f27e19dd5 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -300,7 +300,7 @@ class XCLIPAttention(nn.Module): attn_weights = nn.functional.softmax(attn_weights, dim=-1) if output_attentions: - # this operation is a bit akward, but it's required to + # this operation is a bit awkward, but it's required to # make sure that attn_weights keeps its gradient. # In order to do so, attn_weights have to reshaped # twice and have to be reused in the following