Bias should be resized with the weights

Created a link between the linear layer bias and the model attribute bias. This does not change anything for the user nor for the conversion scripts, but allows the `resize_token_embeddings` method to resize the bias as well as the weights of the decoder. Added a test.
2020-01-14 10:13:31 -05:00
parent 6c32d8bb95
commit 100e3b6f21
4 changed files with 16 additions and 0 deletions
--- a/src/transformers/modeling_albert.py
+++ b/src/transformers/modeling_albert.py
@@ -579,6 +579,9 @@ class AlbertMLMHead(nn.Module):
        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
        self.activation = ACT2FN[config.hidden_act]
        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias
    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -481,6 +481,9 @@ class BertLMPredictionHead(nn.Module):
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias
    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states) + self.bias
--- a/src/transformers/modeling_roberta.py
+++ b/src/transformers/modeling_roberta.py
@@ -306,6 +306,9 @@ class RobertaLMHead(nn.Module):
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias
    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -487,6 +487,8 @@ class ModelTesterMixin:
            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
            # Check that it actually resizes the embeddings matrix
            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
            # Check that the model can still do a forward pass successfully (every parameter should be resized)
            model(**inputs_dict)
            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
            model_embed = model.resize_token_embeddings(model_vocab_size - 15)
@@ -494,6 +496,11 @@ class ModelTesterMixin:
            # Check that it actually resizes the embeddings matrix
            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
            # Check that the model can still do a forward pass successfully (every parameter should be resized)
            # Input ids should be clamped to the maximum size of the vocabulary
            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
            model(**inputs_dict)
            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
            models_equal = True
            for p1, p2 in zip(cloned_embeddings, model_embed.weight):