From c30139a013f8d65dc691efaac107691bb798419e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 30 Apr 2019 10:45:26 +0200
Subject: [PATCH] add special tokens to gpt-2

---
 pytorch_pretrained_bert/modeling_gpt2.py   | 60 +++++++++++++++++++---
 pytorch_pretrained_bert/modeling_openai.py |  6 +--
 tests/modeling_gpt2_test.py                | 10 ++--
 3 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 063c525d98..05a748d43c 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -107,6 +107,7 @@ class GPT2Config(object):
     def __init__(
         self,
         vocab_size_or_config_json_file=50257,
+        n_special=0,
         n_positions=1024,
         n_ctx=1024,
         n_embd=768,
@@ -119,6 +120,7 @@ class GPT2Config(object):
 
         Args:
             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+            n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             n_embd: Dimensionality of the embeddings and hidden states.
@@ -137,6 +139,7 @@ class GPT2Config(object):
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
             self.vocab_size = vocab_size_or_config_json_file
+            self.n_special = n_special
             self.n_ctx = n_ctx
             self.n_positions = n_positions
             self.n_embd = n_embd
@@ -150,6 +153,10 @@ class GPT2Config(object):
                 "or the path to a pretrained model config file (str)"
             )
 
+    @property
+    def total_tokens_embeddings(self):
+        return self.vocab_size + self.n_special
+
     @classmethod
     def from_dict(cls, json_object):
         """Constructs a `GPT2Config` from a Python dictionary of parameters."""
@@ -290,11 +297,12 @@ class GPT2LMHead(nn.Module):
     def __init__(self, model_embeddings_weights, config):
         super(GPT2LMHead, self).__init__()
         self.n_embd = config.n_embd
+        embed_shape = model_embeddings_weights.shape
+        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
         self.set_embeddings_weights(model_embeddings_weights)
 
     def set_embeddings_weights(self, model_embeddings_weights):
         embed_shape = model_embeddings_weights.shape
-        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
         self.decoder.weight = model_embeddings_weights  # Tied weights
 
     def forward(self, hidden_state):
@@ -345,7 +353,7 @@ class GPT2PreTrainedModel(nn.Module):
             )
         self.config = config
 
-    def set_tied(self):
+    def set_num_special_tokens(self, num_special_tokens):
         pass
 
     def init_weights(self, module):
@@ -475,14 +483,32 @@ class GPT2PreTrainedModel(nn.Module):
                 "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
             )
 
-        # Make sure we are still sharing the output and input embeddings after loading weights
-        model.set_tied()
+        # Add additional embeddings for special tokens if needed
+        # This step also make sure we are still sharing the output and input embeddings after loading weights
+        model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special)
         return model
 
 
 class GPT2Model(GPT2PreTrainedModel):
     """OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners").
 
+    GPT-2 use a single embedding matrix to store the word and special embeddings.
+    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
+    Special tokens need to be trained during the fine-tuning if you use them.
+    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
+
+    The embeddings are ordered as follow in the token embeddings matrice:
+        [0,                                                         ----------------------
+         ...                                                        -> word embeddings
+         config.vocab_size - 1,                                     ______________________
+         config.vocab_size,
+         ...                                                        -> special embeddings
+         config.vocab_size + config.n_special - 1]                  ______________________
+
+    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
+        total_tokens_embeddings = config.vocab_size + config.n_special
+    You should use the associate indices to index the embeddings.
+
     Params:
         config: a GPT2Config class instance with the configuration to build a new model
 
@@ -529,6 +555,20 @@ class GPT2Model(GPT2PreTrainedModel):
 
         self.apply(self.init_weights)
 
+    def set_num_special_tokens(self, num_special_tokens):
+        " Update input embeddings with new embedding matrice if needed "
+        if self.config.n_special == num_special_tokens:
+            return
+        # Update config
+        self.config.n_special = num_special_tokens
+        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
+        old_embed = self.wte
+        self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
+        self.wte.to(old_embed.weight.device)
+        self.init_weights(self.wte)
+        # Copy word embeddings from the previous weights
+        self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
+
     def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None):
         if past is None:
             past_length = 0
@@ -610,9 +650,11 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
         self.apply(self.init_weights)
 
-    def set_tied(self):
-        """ Make sure we are sharing the embeddings
+    def set_num_special_tokens(self, num_special_tokens):
+        """ Update input and output embeddings with new embedding matrice
+            Make sure we are sharing the embeddings
         """
+        self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.wte.weight)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None):
@@ -687,9 +729,11 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         self.multiple_choice_head = GPT2MultipleChoiceHead(config)
         self.apply(self.init_weights)
 
-    def set_tied(self):
-        """ Make sure we are sharing the embeddings
+    def set_num_special_tokens(self, num_special_tokens):
+        """ Update input and output embeddings with new embedding matrice
+            Make sure we are sharing the embeddings
         """
+        self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.wte.weight)
 
     def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, past=None):
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index f956462ddb..7ac3782b42 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -344,11 +344,12 @@ class OpenAIGPTLMHead(nn.Module):
     def __init__(self, model_embeddings_weights, config):
         super(OpenAIGPTLMHead, self).__init__()
         self.n_embd = config.n_embd
+        embed_shape = model_embeddings_weights.shape
+        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
         self.set_embeddings_weights(model_embeddings_weights)
 
     def set_embeddings_weights(self, model_embeddings_weights):
         embed_shape = model_embeddings_weights.shape
-        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
         self.decoder.weight = model_embeddings_weights  # Tied weights
 
     def forward(self, hidden_state):
@@ -592,8 +593,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 
     def __init__(self, config):
         super(OpenAIGPTModel, self).__init__(config)
-        num_tokens = config.vocab_size + config.n_special
-        self.tokens_embed = nn.Embedding(num_tokens, config.n_embd)
+        self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
         self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         block = Block(config.n_ctx, config, scale=True)
diff --git a/tests/modeling_gpt2_test.py b/tests/modeling_gpt2_test.py
index 8f4581b37f..6804b794c5 100644
--- a/tests/modeling_gpt2_test.py
+++ b/tests/modeling_gpt2_test.py
@@ -41,6 +41,7 @@ class GPT2ModelTest(unittest.TestCase):
                      use_token_type_ids=True,
                      use_labels=True,
                      vocab_size=99,
+                     n_special=1,
                      n_positions=33,
                      n_embd=32,
                      n_layer=5,
@@ -58,6 +59,7 @@ class GPT2ModelTest(unittest.TestCase):
             self.use_token_type_ids = use_token_type_ids
             self.use_labels = use_labels
             self.vocab_size = vocab_size
+            self.n_special = n_special
             self.n_positions = n_positions
             self.n_embd = n_embd
             self.n_layer = n_layer
@@ -69,7 +71,8 @@ class GPT2ModelTest(unittest.TestCase):
             self.scope = scope
 
         def prepare_config_and_inputs(self):
-            input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size)
+            total_num_tokens = self.vocab_size + self.n_special
+            input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
 
             position_ids = None
             if self.use_position_ids:
@@ -90,6 +93,7 @@ class GPT2ModelTest(unittest.TestCase):
 
             config = GPT2Config(
                 vocab_size_or_config_json_file=self.vocab_size,
+                n_special=self.n_special,
                 n_positions=self.n_positions,
                 n_embd=self.n_embd,
                 n_layer=self.n_layer,
@@ -130,7 +134,7 @@ class GPT2ModelTest(unittest.TestCase):
             return outputs
 
         def check_gpt2_lm_head_output(self, result):
-            total_voc = self.vocab_size
+            total_voc = self.n_special + self.vocab_size
             self.parent.assertListEqual(
                 list(result["lm_logits"].size()),
                 [self.batch_size, self.n_choices, self.seq_length, total_voc])
@@ -157,7 +161,7 @@ class GPT2ModelTest(unittest.TestCase):
             return outputs
 
         def check_gpt2_double_heads_output(self, result):
-            total_voc = self.vocab_size
+            total_voc = self.n_special + self.vocab_size
             self.parent.assertListEqual(
                 list(result["lm_logits"].size()),
                 [self.batch_size, self.n_choices, self.seq_length, total_voc])