🔴 🚨 Resizing tokens embeddings: initialize from old embeddings' normal distribution. (#33325)

* Initialize new embeddings from a normal distribution

* Fix typo in comments

* Fix typo in comments

* Fix style

* Fix variables naming

* Add tests

* Fix style

* code consistency nit

* Add deepspeed support

* Add deepspeed support

* Convert embeddings weights to float32 before computations

* Add deepspeed tests

* Cover when vocab_size is smaller than embedding_size

* Style fix

* Add tests for vocab_size smaller than hidden_size

* Style fix

* Nits in tests

* Nits in tests

* Check for deepspeed before importing it

* Increase vocab_size for positive definite covariance matrix test

* Add warning

* Add multivariate_resizing flag and implement resizing for lm_heads

* Fix typo

* Fix wrong bias indexing

* Fix bias is zero check

* remove multivariate_resizing flag from tests

* Initialize bias from old bias normal distribution

* Fixup

* Code usability

* Use mean_resizing instead of multivariate_resizing

* Fix up

* Fix comments and docs
This commit is contained in:
Mohamed Abu El-Nasr
2024-10-04 17:29:55 +03:00
committed by GitHub
parent b916efcb3c
commit 78ef58325c
2 changed files with 314 additions and 24 deletions

View File

@@ -2049,7 +2049,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
return list(_no_split_modules) return list(_no_split_modules)
def resize_token_embeddings( def resize_token_embeddings(
self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None self,
new_num_tokens: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
mean_resizing: bool = True,
) -> nn.Embedding: ) -> nn.Embedding:
""" """
Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`. Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.
@@ -2069,11 +2072,19 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
`>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
details about this, or help on choosing the correct value for resizing, refer to this guide: details about this, or help on choosing the correct value for resizing, refer to this guide:
https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
mean_resizing (`bool`):
Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.
Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
where the generated tokens' probabilities won't be affected by the added embeddings because initializing the new embeddings with the
old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html
Return: Return:
`torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model. `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
""" """
model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of) model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
if new_num_tokens is None and pad_to_multiple_of is None: if new_num_tokens is None and pad_to_multiple_of is None:
return model_embeds return model_embeds
@@ -2096,9 +2107,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
return model_embeds return model_embeds
def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None): def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None, mean_resizing=True):
old_embeddings = self.get_input_embeddings() old_embeddings = self.get_input_embeddings()
new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of) new_embeddings = self._get_resized_embeddings(
old_embeddings, new_num_tokens, pad_to_multiple_of, mean_resizing
)
if hasattr(old_embeddings, "_hf_hook"): if hasattr(old_embeddings, "_hf_hook"):
hook = old_embeddings._hf_hook hook = old_embeddings._hf_hook
add_hook_to_module(new_embeddings, hook) add_hook_to_module(new_embeddings, hook)
@@ -2121,9 +2134,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings: if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings:
old_lm_head = self.get_output_embeddings() old_lm_head = self.get_output_embeddings()
if isinstance(old_lm_head, torch.nn.Embedding): if isinstance(old_lm_head, torch.nn.Embedding):
new_lm_head = self._get_resized_embeddings(old_lm_head, new_num_tokens) new_lm_head = self._get_resized_embeddings(old_lm_head, new_num_tokens, mean_resizing=mean_resizing)
else: else:
new_lm_head = self._get_resized_lm_head(old_lm_head, new_num_tokens) new_lm_head = self._get_resized_lm_head(old_lm_head, new_num_tokens, mean_resizing=mean_resizing)
if hasattr(old_lm_head, "_hf_hook"): if hasattr(old_lm_head, "_hf_hook"):
hook = old_lm_head._hf_hook hook = old_lm_head._hf_hook
add_hook_to_module(new_lm_head, hook) add_hook_to_module(new_lm_head, hook)
@@ -2138,6 +2151,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
old_embeddings: nn.Embedding, old_embeddings: nn.Embedding,
new_num_tokens: Optional[int] = None, new_num_tokens: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None, pad_to_multiple_of: Optional[int] = None,
mean_resizing: bool = True,
) -> nn.Embedding: ) -> nn.Embedding:
""" """
Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly
@@ -2160,6 +2174,14 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
`>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
details about this, or help on choosing the correct value for resizing, refer to this guide: details about this, or help on choosing the correct value for resizing, refer to this guide:
https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
mean_resizing (`bool`):
Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.
Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
where the generated tokens' probabilities will not be affected by the added embeddings because initializing the new embeddings with the
old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html
Return: Return:
@@ -2218,9 +2240,33 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
dtype=old_embeddings.weight.dtype, dtype=old_embeddings.weight.dtype,
) )
# initialize all new embeddings (in particular added tokens) if new_num_tokens > old_num_tokens and not mean_resizing:
# initialize new embeddings (in particular added tokens) with a mean of 0 and std equals `config.initializer_range`.
self._init_weights(new_embeddings) self._init_weights(new_embeddings)
elif new_num_tokens > old_num_tokens and mean_resizing:
# initialize new embeddings (in particular added tokens). The new embeddings will be initialized
# from a multivariate normal distribution that has old embeddings' mean and covariance.
# as described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html
logger.warning_once(
"The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. "
"As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. "
"To disable this, use `mean_resizing=False`"
)
added_num_tokens = new_num_tokens - old_num_tokens
if is_deepspeed_zero3_enabled() and not is_quantized:
import deepspeed
with deepspeed.zero.GatheredParameters([old_embeddings.weight], modifier_rank=None):
self._init_added_embeddings_weights_with_mean(
old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens
)
else:
self._init_added_embeddings_weights_with_mean(
old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens
)
# Copy token embeddings from the previous weights # Copy token embeddings from the previous weights
# numbers of tokens to copy # numbers of tokens to copy
@@ -2259,7 +2305,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
return old_embeddings return old_embeddings
def _get_resized_lm_head( def _get_resized_lm_head(
self, old_lm_head: nn.Linear, new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False self,
old_lm_head: nn.Linear,
new_num_tokens: Optional[int] = None,
transposed: Optional[bool] = False,
mean_resizing: bool = True,
) -> nn.Linear: ) -> nn.Linear:
""" """
Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized
@@ -2276,6 +2326,14 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
`torch.nn.Linear` module of the model without doing anything. transposed (`bool`, *optional*, defaults `torch.nn.Linear` module of the model without doing anything. transposed (`bool`, *optional*, defaults
to `False`): Whether `old_lm_head` is transposed or not. If True `old_lm_head.size()` is `lm_head_dim, to `False`): Whether `old_lm_head` is transposed or not. If True `old_lm_head.size()` is `lm_head_dim,
vocab_size` else `vocab_size, lm_head_dim`. vocab_size` else `vocab_size, lm_head_dim`.
mean_resizing (`bool`):
Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.
Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
where the generated tokens' probabilities will not be affected by the added embeddings because initializing the new embeddings with the
old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html
Return: Return:
`torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if `new_num_tokens` is `torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if `new_num_tokens` is
@@ -2322,9 +2380,41 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
dtype=old_lm_head.weight.dtype, dtype=old_lm_head.weight.dtype,
) )
# initialize new lm head (in particular added tokens) if new_num_tokens > old_num_tokens and not mean_resizing:
# initialize new embeddings (in particular added tokens) with a mean of 0 and std equals `config.initializer_range`.
self._init_weights(new_lm_head) self._init_weights(new_lm_head)
elif new_num_tokens > old_num_tokens and mean_resizing:
# initialize new lm_head weights (in particular added tokens). The new lm_head weights
# will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance.
# as described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html
logger.warning_once(
"The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. "
"As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. "
"To disable this, use `mean_resizing=False`"
)
added_num_tokens = new_num_tokens - old_num_tokens
if is_deepspeed_zero3_enabled() and not is_quantized:
import deepspeed
params = [old_lm_head.weight]
if has_new_lm_head_bias:
params += [old_lm_head.bias]
with deepspeed.zero.GatheredParameters(params, modifier_rank=None):
self._init_added_lm_head_weights_with_mean(
old_lm_head, new_lm_head, old_lm_head_dim, old_num_tokens, added_num_tokens, transposed
)
if has_new_lm_head_bias:
self._init_added_lm_head_bias_with_mean(old_lm_head, new_lm_head, added_num_tokens)
else:
self._init_added_lm_head_weights_with_mean(
old_lm_head, new_lm_head, old_lm_head_dim, old_num_tokens, added_num_tokens, transposed
)
if has_new_lm_head_bias:
self._init_added_lm_head_bias_with_mean(old_lm_head, new_lm_head, added_num_tokens)
num_tokens_to_copy = min(old_num_tokens, new_num_tokens) num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
if is_deepspeed_zero3_enabled() and not is_quantized: if is_deepspeed_zero3_enabled() and not is_quantized:
@@ -2342,6 +2432,52 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
return new_lm_head return new_lm_head
def _init_added_embeddings_weights_with_mean(
    self, old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens
):
    """
    Initialize the last `added_num_tokens` rows of `new_embeddings.weight` from the statistics of the old
    embeddings.

    The rows are sampled from a multivariate normal distribution whose mean is the mean of the old embedding
    rows and whose covariance is the old embeddings' covariance scaled by `1e-5`. If that scaled covariance is
    not positive definite (for example when the old rows are identical or linearly dependent), no distribution
    can be built, so the added rows are set to the old embeddings' mean instead of raising.

    Args:
        old_embeddings: Module whose `weight` holds the old embeddings, one row per token.
        new_embeddings: Module whose `weight` is modified in place; only its last `added_num_tokens` rows
            are written.
        old_embedding_dim (`int`): Number of columns of the old embedding matrix.
        old_num_tokens (`int`): Number of rows of the old embedding matrix.
        added_num_tokens (`int`): Number of rows to initialize. Nothing is done when it is not positive.
    """
    from torch.distributions import constraints

    if added_num_tokens <= 0:
        # `weight[-0:]` would select every row, so there is nothing that can safely be initialized.
        return

    # Statistics are computed in float32 regardless of the dtype of the stored weights.
    old_embeddings_weight = old_embeddings.weight.data.to(torch.float32)
    mean_embeddings = torch.mean(old_embeddings_weight, axis=0)
    old_centered_embeddings = old_embeddings_weight - mean_embeddings
    covariance = old_centered_embeddings.T @ old_centered_embeddings / old_num_tokens
    if old_embedding_dim >= old_num_tokens:
        # The covariance matrix must be positive definite. When `vocab_size` is not larger than
        # `hidden_size` the matrix is rank deficient, so a small multiple of the identity matrix
        # is added to make it positive definite.
        covariance = covariance + torch.eye(old_embedding_dim, device=old_embeddings.weight.device) * 1e-3

    # Scaling the covariance down keeps the sampled rows close to the mean.
    scaled_covariance = 1e-5 * covariance

    # Even with more tokens than dimensions the covariance can be singular or numerically indefinite;
    # `MultivariateNormal` would raise in that case, so check before building the distribution.
    if constraints.positive_definite.check(scaled_covariance).all():
        distribution = torch.distributions.multivariate_normal.MultivariateNormal(
            mean_embeddings, covariance_matrix=scaled_covariance
        )
        added_rows = distribution.sample(sample_shape=(added_num_tokens,))
    else:
        # No valid distribution exists: initialize every added row with the mean itself.
        added_rows = mean_embeddings[None, :].repeat(added_num_tokens, 1)

    new_embeddings.weight.data[-1 * added_num_tokens :, :] = added_rows.to(old_embeddings.weight.dtype)
def _init_added_lm_head_weights_with_mean(
    self,
    old_lm_head,
    new_lm_head,
    old_lm_head_dim,
    old_num_tokens,
    added_num_tokens,
    transposed=False,
):
    """
    Initialize the weights added to `new_lm_head` with the same logic as the one used for embeddings.

    Args:
        old_lm_head: Module holding the old lm_head weights.
        new_lm_head: Module whose added weights are initialized in place.
        old_lm_head_dim (`int`): Hidden dimension of the old lm_head.
        old_num_tokens (`int`): Number of tokens of the old lm_head.
        added_num_tokens (`int`): Number of tokens added to the new lm_head.
        transposed (`bool`, defaults to `False`):
            Whether the weights are stored as `lm_head_dim, vocab_size` instead of `vocab_size, lm_head_dim`.
    """
    if transposed:
        # The embeddings initializer treats rows as tokens, so BOTH heads must be presented to it as
        # `vocab_size, lm_head_dim`. Transposing only the new head would compute the mean and covariance
        # of the old head over the wrong axis.
        new_lm_head.weight.data = new_lm_head.weight.data.T
        old_lm_head.weight.data = old_lm_head.weight.data.T

    try:
        # The same initialization logic as Embeddings.
        self._init_added_embeddings_weights_with_mean(
            old_lm_head, new_lm_head, old_lm_head_dim, old_num_tokens, added_num_tokens
        )
    finally:
        if transposed:
            # Transpose again to the original layout, even if the initialization raised.
            new_lm_head.weight.data = new_lm_head.weight.data.T
            old_lm_head.weight.data = old_lm_head.weight.data.T
def _init_added_lm_head_bias_with_mean(self, old_lm_head, new_lm_head, added_num_tokens):
    """
    Fill the last `added_num_tokens` entries of `new_lm_head.bias` in place with values drawn from a normal
    distribution whose mean is the mean of the old bias and whose std is the old bias' std scaled by `1e-5`.
    """
    old_bias = old_lm_head.bias.data
    # The std is computed in the bias' own dtype and then cast; the mean is accumulated in float32.
    bias_std = torch.std(old_bias, axis=0).to(torch.float32)
    bias_mean = torch.mean(old_bias, axis=0, dtype=torch.float32)
    added_bias = new_lm_head.bias.data[-1 * added_num_tokens :]
    added_bias.normal_(mean=bias_mean, std=bias_std * 1e-5)
def _copy_lm_head_original_to_resized( def _copy_lm_head_original_to_resized(
self, new_lm_head, old_lm_head, num_tokens_to_copy, transposed, has_new_lm_head_bias self, new_lm_head, old_lm_head, num_tokens_to_copy, transposed, has_new_lm_head_bias
): ):

View File

@@ -25,6 +25,7 @@ import tempfile
import time import time
import warnings import warnings
from collections import defaultdict from collections import defaultdict
from contextlib import contextmanager
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
import numpy as np import numpy as np
@@ -45,6 +46,12 @@ from transformers import (
logging, logging,
set_seed, set_seed,
) )
from transformers.integrations import HfDeepSpeedConfig
from transformers.integrations.deepspeed import (
is_deepspeed_available,
is_deepspeed_zero3_enabled,
unset_hf_deepspeed_config,
)
from transformers.models.auto import get_values from transformers.models.auto import get_values
from transformers.models.auto.modeling_auto import ( from transformers.models.auto.modeling_auto import (
MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
@@ -75,6 +82,7 @@ from transformers.testing_utils import (
is_pt_tf_cross_test, is_pt_tf_cross_test,
require_accelerate, require_accelerate,
require_bitsandbytes, require_bitsandbytes,
require_deepspeed,
require_flash_attn, require_flash_attn,
require_non_xpu, require_non_xpu,
require_read_token, require_read_token,
@@ -134,6 +142,9 @@ if is_flax_available():
if is_torch_fx_available(): if is_torch_fx_available():
from transformers.utils.fx import _FX_SUPPORTED_MODELS_WITH_KV_CACHE, symbolic_trace from transformers.utils.fx import _FX_SUPPORTED_MODELS_WITH_KV_CACHE, symbolic_trace
if is_deepspeed_available():
import deepspeed
def _config_zero_init(config): def _config_zero_init(config):
configs_no_init = copy.deepcopy(config) configs_no_init = copy.deepcopy(config)
@@ -171,6 +182,15 @@ def _mock_all_init_weights(self):
self.tie_weights() self.tie_weights()
@contextmanager
def _deepspeed_zero3(ds_config):
    """
    Context manager that builds an `HfDeepSpeedConfig` from `ds_config`, yields it, and always calls
    `unset_hf_deepspeed_config()` on exit, including when the body raises.
    """
    # NOTE(review): presumably constructing the config is what makes `is_deepspeed_zero3_enabled()`
    # return True inside the `with` body - confirm against `transformers.integrations.deepspeed`.
    hf_ds_config = HfDeepSpeedConfig(ds_config)
    try:
        yield hf_ds_config
    finally:
        unset_hf_deepspeed_config()
@require_torch @require_torch
class ModelTesterMixin: class ModelTesterMixin:
model_tester = None model_tester = None
@@ -1797,8 +1817,13 @@ class ModelTesterMixin:
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
config = copy.deepcopy(original_config) config = copy.deepcopy(original_config)
if is_deepspeed_zero3_enabled():
with deepspeed.zero.Init():
model = model_class(config)
else:
model = model_class(config) model = model_class(config)
model.to(torch_device) model.to(torch_device)
model_embed_pre_resize = model.get_input_embeddings() model_embed_pre_resize = model.get_input_embeddings()
type_model_embed_pre_resize = type(model_embed_pre_resize) type_model_embed_pre_resize = type(model_embed_pre_resize)
@@ -1813,14 +1838,25 @@ class ModelTesterMixin:
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size + 10) model_embed = model.resize_token_embeddings(model_vocab_size + 10)
new_model_vocab_size = model.config.get_text_config().vocab_size new_model_vocab_size = model.config.get_text_config().vocab_size
self.assertEqual(new_model_vocab_size, model_vocab_size + 10) self.assertEqual(new_model_vocab_size, model_vocab_size + 10)
# Check that it actually resizes the embeddings matrix # Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
# Check to make sure the type of embeddings returned post resizing is same as type of input # Check to make sure the type of embeddings returned post resizing is same as type of input
type_model_embed_post_resize = type(model_embed) type_model_embed_post_resize = type(model_embed)
self.assertEqual(type_model_embed_pre_resize, type_model_embed_post_resize) self.assertEqual(type_model_embed_pre_resize, type_model_embed_post_resize)
# Check that added embeddings mean is close to the old embeddings mean
if is_deepspeed_zero3_enabled():
with deepspeed.zero.GatheredParameters(model_embed.weight, modifier_rank=None):
old_embeddings_mean = torch.mean(model_embed.weight.data[:-10, :], axis=0)
new_embeddings_mean = torch.mean(model_embed.weight.data[-10:, :], axis=0)
else:
old_embeddings_mean = torch.mean(model_embed.weight.data[:-10, :], axis=0)
new_embeddings_mean = torch.mean(model_embed.weight.data[-10:, :], axis=0)
torch.testing.assert_close(old_embeddings_mean, new_embeddings_mean, atol=1e-3, rtol=1e-1)
# Check that the model can still do a forward pass successfully (every parameter should be resized) # Check that the model can still do a forward pass successfully (every parameter should be resized)
if not is_deepspeed_zero3_enabled():
# A distributed launcher is needed for the forward pass when deepspeed is enabled
model(**self._prepare_for_class(inputs_dict, model_class)) model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
@@ -1835,6 +1871,8 @@ class ModelTesterMixin:
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1) inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
# make sure that decoder_input_ids are resized as well # make sure that decoder_input_ids are resized as well
if not is_deepspeed_zero3_enabled():
# A distributed launcher is needed for the forward pass when deepspeed is enabled
if "decoder_input_ids" in inputs_dict: if "decoder_input_ids" in inputs_dict:
inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
model(**self._prepare_for_class(inputs_dict, model_class)) model(**self._prepare_for_class(inputs_dict, model_class))
@@ -1847,7 +1885,11 @@ class ModelTesterMixin:
self.assertTrue(models_equal) self.assertTrue(models_equal)
config = copy.deepcopy(original_config) del model
if is_deepspeed_zero3_enabled():
with deepspeed.zero.Init():
model = model_class(config)
else:
model = model_class(config) model = model_class(config)
model.to(torch_device) model.to(torch_device)
@@ -1877,6 +1919,63 @@ class ModelTesterMixin:
): ):
model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3) model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3)
# Test when `vocab_size` is smaller than `hidden_size`.
del model
config.vocab_size = 4
if is_deepspeed_zero3_enabled():
with deepspeed.zero.Init():
model = model_class(config)
else:
model = model_class(config)
model.to(torch_device)
model_vocab_size = config.get_text_config().vocab_size
# Retrieve the embeddings and clone them
model_embed = model.resize_token_embeddings(model_vocab_size)
cloned_embeddings = model_embed.weight.clone()
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size + 10)
new_model_vocab_size = model.config.get_text_config().vocab_size
self.assertEqual(new_model_vocab_size, model_vocab_size + 10)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
# Check to make sure the type of embeddings returned post resizing is same as type of input
type_model_embed_post_resize = type(model_embed)
self.assertEqual(type_model_embed_pre_resize, type_model_embed_post_resize)
# Check that added embeddings mean is close to the old embeddings mean
if is_deepspeed_zero3_enabled():
with deepspeed.zero.GatheredParameters(model_embed.weight, modifier_rank=None):
old_embeddings_mean = torch.mean(model_embed.weight.data[:-10, :], axis=0)
new_embeddings_mean = torch.mean(model_embed.weight.data[-10:, :], axis=0)
else:
old_embeddings_mean = torch.mean(model_embed.weight.data[:-10, :], axis=0)
new_embeddings_mean = torch.mean(model_embed.weight.data[-10:, :], axis=0)
torch.testing.assert_close(old_embeddings_mean, new_embeddings_mean, atol=1e-3, rtol=1e-1)
@require_deepspeed
@require_torch_gpu
def test_resize_tokens_embeddings_with_deepspeed(self):
    """Re-run `test_resize_tokens_embeddings` under a ZeRO stage 3 config that offloads parameters to CPU."""
    offload_param = {"device": "cpu", "pin_memory": True}
    zero_optimization = {"stage": 3, "offload_param": offload_param}
    with _deepspeed_zero3({"zero_optimization": zero_optimization}):
        self.test_resize_tokens_embeddings()
@require_deepspeed
@require_torch_multi_gpu
def test_resize_tokens_embeddings_with_deepspeed_multi_gpu(self):
    """Re-run `test_resize_tokens_embeddings` under a plain ZeRO stage 3 config (no offloading)."""
    zero_optimization = {"stage": 3}
    with _deepspeed_zero3({"zero_optimization": zero_optimization}):
        self.test_resize_tokens_embeddings()
def test_resize_embeddings_untied(self): def test_resize_embeddings_untied(self):
if not self.test_resize_embeddings: if not self.test_resize_embeddings:
self.skipTest(reason="test_resize_embeddings is set to `False`") self.skipTest(reason="test_resize_embeddings is set to `False`")
@@ -1890,6 +1989,10 @@ class ModelTesterMixin:
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
config = copy.deepcopy(original_config) config = copy.deepcopy(original_config)
if is_deepspeed_zero3_enabled():
with deepspeed.zero.Init():
model = model_class(config)
else:
model = model_class(config).to(torch_device) model = model_class(config).to(torch_device)
# if no output embeddings -> leave test # if no output embeddings -> leave test
@@ -1907,8 +2010,34 @@ class ModelTesterMixin:
if output_embeds.bias is not None: if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized) # Check that the model can still do a forward pass successfully (every parameter should be resized)
if not is_deepspeed_zero3_enabled():
# A distributed launcher is needed for the forward pass when deepspeed is enabled
model(**self._prepare_for_class(inputs_dict, model_class)) model(**self._prepare_for_class(inputs_dict, model_class))
# Test multivariate resizing.
model.resize_token_embeddings(model_vocab_size + 10)
output_embeds = model.get_output_embeddings()
# Check that added embeddings mean is close to the old embeddings mean
if is_deepspeed_zero3_enabled():
with deepspeed.zero.GatheredParameters(output_embeds.weight, modifier_rank=None):
old_embeddings_mean = torch.mean(output_embeds.weight.data[:-10, :], axis=0)
new_embeddings_mean = torch.mean(output_embeds.weight.data[-10:, :], axis=0)
else:
old_embeddings_mean = torch.mean(output_embeds.weight.data[:-10, :], axis=0)
new_embeddings_mean = torch.mean(output_embeds.weight.data[-10:, :], axis=0)
torch.testing.assert_close(old_embeddings_mean, new_embeddings_mean, atol=1e-3, rtol=1e-1)
# check if the bias is always initialized with zero.
if output_embeds.bias is not None:
if is_deepspeed_zero3_enabled():
with deepspeed.zero.GatheredParameters(output_embeds.bias, modifier_rank=None):
old_bias_mean = torch.mean(output_embeds.bias.data[:-10], axis=0)
new_bias_mean = torch.mean(output_embeds.bias.data[-10:], axis=0)
else:
old_bias_mean = torch.mean(output_embeds.bias.data[:-10], axis=0)
new_bias_mean = torch.mean(output_embeds.bias.data[-10:], axis=0)
torch.testing.assert_close(old_bias_mean, new_bias_mean, atol=1e-5, rtol=1e-2)
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model.resize_token_embeddings(model_vocab_size - 15) model.resize_token_embeddings(model_vocab_size - 15)
new_model_vocab_size = model.config.get_text_config().vocab_size new_model_vocab_size = model.config.get_text_config().vocab_size
@@ -1925,8 +2054,33 @@ class ModelTesterMixin:
if "decoder_input_ids" in inputs_dict: if "decoder_input_ids" in inputs_dict:
inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
# Check that the model can still do a forward pass successfully (every parameter should be resized) # Check that the model can still do a forward pass successfully (every parameter should be resized)
if not is_deepspeed_zero3_enabled():
# A distributed launcher is needed for the forward pass when deepspeed is enabled
model(**self._prepare_for_class(inputs_dict, model_class)) model(**self._prepare_for_class(inputs_dict, model_class))
@require_deepspeed
@require_torch_gpu
def test_resize_embeddings_untied_with_deepspeed(self):
    """Re-run `test_resize_embeddings_untied` under a ZeRO stage 3 config that offloads parameters to CPU."""
    offload_param = {"device": "cpu", "pin_memory": True}
    zero_optimization = {"stage": 3, "offload_param": offload_param}
    with _deepspeed_zero3({"zero_optimization": zero_optimization}):
        self.test_resize_embeddings_untied()
@require_deepspeed
@require_torch_multi_gpu
def test_resize_embeddings_untied_with_deepspeed_multi_gpu(self):
    """Re-run `test_resize_embeddings_untied` under a plain ZeRO stage 3 config (no offloading)."""
    zero_optimization = {"stage": 3}
    with _deepspeed_zero3({"zero_optimization": zero_optimization}):
        self.test_resize_embeddings_untied()
def test_model_get_set_embeddings(self): def test_model_get_set_embeddings(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()