From 06886d5a684228a695b29645993b3be55190bd9c Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Thu, 3 Nov 2022 12:05:04 -0400 Subject: [PATCH] Only resize embeddings when necessary (#20043) * Only resize embeddings when necessary * Add comment --- examples/pytorch/language-modeling/run_clm.py | 6 +++++- examples/pytorch/language-modeling/run_clm_no_trainer.py | 6 +++++- examples/pytorch/language-modeling/run_mlm.py | 6 +++++- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 6 +++++- examples/pytorch/language-modeling/run_plm.py | 6 +++++- examples/pytorch/multiple-choice/run_swag_no_trainer.py | 6 +++++- examples/pytorch/question-answering/run_seq2seq_qa.py | 6 +++++- examples/pytorch/summarization/run_summarization.py | 6 +++++- .../pytorch/summarization/run_summarization_no_trainer.py | 6 +++++- .../pytorch/token-classification/run_ner_no_trainer.py | 8 +++++++- examples/pytorch/translation/run_translation.py | 6 +++++- .../pytorch/translation/run_translation_no_trainer.py | 6 +++++- examples/tensorflow/language-modeling/run_clm.py | 6 +++++- examples/tensorflow/language-modeling/run_mlm.py | 6 +++++- examples/tensorflow/summarization/run_summarization.py | 6 +++++- examples/tensorflow/token-classification/run_ner.py | 6 +++++- examples/tensorflow/translation/run_translation.py | 6 +++++- 17 files changed, 87 insertions(+), 17 deletions(-) diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index a8cec50a70..c12005500e 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -387,7 +387,11 @@ def main(): n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index af0544c3e7..c62b895249 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -378,7 +378,11 @@ def main(): logger.info("Training new model from scratch") model = AutoModelForCausalLM.from_config(config) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 340e3a184b..ef2659213f 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -389,7 +389,11 @@ def main(): logger.info("Training new model from scratch") model = AutoModelForMaskedLM.from_config(config) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index dbfb50d60a..4a2d5490e1 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -383,7 +383,11 @@ def main(): logger.info("Training new model from scratch") model = AutoModelForMaskedLM.from_config(config) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 1cc7acfdf7..e7d5867567 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -376,7 +376,11 @@ def main(): logger.info("Training new model from scratch") model = XLNetLMHeadModel(config) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index d5213d8c0b..607328f99d 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -398,7 +398,11 @@ def main(): logger.info("Training new model from scratch") model = AutoModelForMultipleChoice.from_config(config) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index 98ef71d248..2cd6275c4f 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -380,7 +380,11 @@ def main(): use_auth_token=True if model_args.use_auth_token else None, ) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 42150343e1..197593ab4b 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -422,7 +422,11 @@ def main(): use_auth_token=True if model_args.use_auth_token else None, ) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): if isinstance(tokenizer, MBartTokenizer): diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 2e17d53185..6e79b2f6c2 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -439,7 +439,11 @@ def main(): logger.info("Training new model from scratch") model = AutoModelForSeq2SeqLM.from_config(config) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index a2aa750344..0f7f9ba291 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -414,7 +414,13 @@ def main(): logger.info("Training new model from scratch") model = AutoModelForTokenClassification.from_config(config) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # Model has labels -> use them. if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id: diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index a684368c98..9f02d4a627 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -380,7 +380,11 @@ def main(): use_auth_token=True if model_args.use_auth_token else None, ) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # Set decoder_start_token_id if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 40e980e778..eefd00b686 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -411,7 +411,11 @@ def main(): logger.info("Training new model from scratch") model = AutoModelForSeq2SeqLM.from_config(config) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # Set decoder_start_token_id if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py index cbe2f54f22..51087123b5 100755 --- a/examples/tensorflow/language-modeling/run_clm.py +++ b/examples/tensorflow/language-modeling/run_clm.py @@ -473,7 +473,11 @@ def main(): logger.info("Training new model from scratch") model = TFAutoModelForCausalLM.from_config(config) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # endregion # region TF Dataset preparation diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py index 43449a0934..680efcdbe4 100755 --- a/examples/tensorflow/language-modeling/run_mlm.py +++ b/examples/tensorflow/language-modeling/run_mlm.py @@ -489,7 +489,11 @@ def main(): logger.info("Training new model from scratch") model = TFAutoModelForMaskedLM.from_config(config) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # endregion # region TF Dataset preparation diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index cb94593fac..7f50d58a89 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -516,7 +516,11 @@ def main(): use_auth_token=True if model_args.use_auth_token else None, ) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # endregion # region Prepare TF Dataset objects diff --git a/examples/tensorflow/token-classification/run_ner.py b/examples/tensorflow/token-classification/run_ner.py index 8eb9aef92b..5e8ee5323d 100644 --- a/examples/tensorflow/token-classification/run_ner.py +++ b/examples/tensorflow/token-classification/run_ner.py @@ -385,7 +385,11 @@ def main(): logger.info("Training new model from scratch") model = TFAutoModelForTokenClassification.from_config(config) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) # endregion # region Create TF datasets diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 966b1f8e66..6ed216cf17 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -469,7 +469,11 @@ def main(): use_auth_token=True if model_args.use_auth_token else None, ) - model.resize_token_embeddings(len(tokenizer)) + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): model.config.forced_bos_token_id = forced_bos_token_id # endregion