From eb644980eb7bd7ed5bc46d47df94a4c87ae31bcf Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Fri, 15 Sep 2023 11:53:39 -0400 Subject: [PATCH] Fix pad to multiple of (#25732) * nits * update the test * nits * update * fix bark * fix bark tests and allow padding to multiple of without new tokens --- src/transformers/modeling_utils.py | 12 +++-- src/transformers/models/bark/modeling_bark.py | 46 +++++++++++++++++-- tests/test_modeling_common.py | 3 ++ 3 files changed, 51 insertions(+), 10 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 41ea192cbb..a5e09d047a 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1436,7 +1436,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything. pad_to_multiple_of (`int`, *optional*): - If set will pad the embedding matrix to a multiple of the provided value. + If set will pad the embedding matrix to a multiple of the provided value.If `new_num_tokens` is set to + `None` will just pad the embedding to a multiple of `pad_to_multiple_of`. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more @@ -1447,12 +1448,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model. """ model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of) - if new_num_tokens is None: + if new_num_tokens is None and pad_to_multiple_of is None: return model_embeds # Update base model and current model config - self.config.vocab_size = new_num_tokens - self.vocab_size = new_num_tokens + self.config.vocab_size = model_embeds.weight.shape[0] + self.vocab_size = model_embeds.weight.shape[0] # Tie weights again if needed self.tie_weights() @@ -1508,7 +1509,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix vectors from the end. If not provided or `None`, just returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything. pad_to_multiple_of (`int`, *optional*): - If set will pad the embedding matrix to a multiple of the provided value. + If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to + `None` will just pad the embedding to a multiple of `pad_to_multiple_of`. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index 25bdb28d38..bdafb63477 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -1086,21 +1086,57 @@ class BarkFineModel(BarkPreTrainedModel): ] ) self.set_input_embeddings(new_embeddings_list) - new_num_tokens = [embed.weight.shape[0] for embed in new_embeddings_list] + new_num_tokens = new_embeddings_list[0].weight.shape[0] # if word embeddings are not tied, make sure that lm head is resized as well if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings: old_lm_head_list = self.get_output_embeddings() new_lm_head_list = nn.ModuleList( - [ - self._get_resized_lm_head(old_lm_head, new_num_token) - for old_lm_head, new_num_token in zip(old_lm_head_list, new_num_tokens) - ] + [self._get_resized_lm_head(old_lm_head, new_num_tokens) for old_lm_head in old_lm_head_list] ) self.set_output_embeddings(new_lm_head_list) return self.get_input_embeddings() + def resize_token_embeddings( + self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None + ) -> nn.Embedding: + """ + Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`. + + Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. + + Arguments: + new_num_tokens (`int`, *optional*): + The number of new tokens in the embedding matrix. Increasing the size will add newly initialized + vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just + returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything. + pad_to_multiple_of (`int`, *optional*): + If set will pad the embedding matrix to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more + details about this, or help on choosing the correct value for resizing, refer to this guide: + https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc + + Return: + `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model. + """ + model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of) + if new_num_tokens is None and pad_to_multiple_of is None: + return model_embeds + + # Update base model and current model config + self.config.output_vocab_size = model_embeds[0].weight.shape[0] + self.config.vocab_size = model_embeds[0].weight.shape[0] + self.output_vocab_size = model_embeds[0].weight.shape[0] + self.vocab_size = model_embeds[0].weight.shape[0] + + # Tie weights again if needed + self.tie_weights() + + return model_embeds + def tie_weights(self): """ Tie the weights between the input embeddings list and the output embeddings list. diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 6f8866cf7d..6e144ed476 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -1431,6 +1431,9 @@ class ModelTesterMixin: model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64) self.assertTrue(model_embed.weight.shape[0] // 64, 0) + self.assertTrue(model_embed.weight.shape[0], model.config.vocab_size) + self.assertTrue(model.config.vocab_size, model.vocab_size) + model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64) self.assertTrue(model_embed.weight.shape[0] // 64, 0)