From 3a848111e6c5a10a4f04f272476de86af78d4a36 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 29 Jan 2019 11:00:11 +0100 Subject: [PATCH] update config, docstrings and readme to switch to seperated tokens and position embeddings --- README.md | 37 +++--- pytorch_pretrained_bert/modeling_openai.py | 130 ++++++++++----------- 2 files changed, 80 insertions(+), 87 deletions(-) diff --git a/README.md b/README.md index be0765f4bb..b124585bbe 100644 --- a/README.md +++ b/README.md @@ -391,35 +391,36 @@ An example on how to use this class is given in the [`run_squad.py`](./examples/ `OpenAIGPTModel` is the basic OpenAI GPT Transformer model with a layer of summed token and position embeddings followed by a series of 12 identical self-attention blocks. -The main implementation difference between BERT and the OpenAI is the use, in OpenAI GPT, of a single embedding matrix to store the word, special (`[SEP]`, `[CLS]`...) token and position embeddings. -The embeddings are ordered as follow in the word embeddings matrice: +OpenAI GPT use a single embedding matrix to store the word and special embeddings. +Special tokens embeddings are additional tokens that are not pre-trained: `[SEP]`, `[CLS]`... +Special tokens need to be trained during the fine-tuning if you use them. +The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. +The embeddings are ordered as follow in the token embeddings matrice: + +```python [0, ---------------------- ... -> word embeddings config.vocab_size - 1, ______________________ config.vocab_size, ... -> special embeddings - config.vocab_size + config.n_special - 1, ______________________ - config.vocab_size + config.n_special, - ... -> position embeddings - total_num_embeddings - 1] ______________________ + config.vocab_size + config.n_special - 1] ______________________ +``` -where total_num_embeddings can be obtained as config.total_num_embeddings and is: - - total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx +where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + `total_tokens_embeddings = config.vocab_size + config.n_special` You should use the associate indices to index the embeddings. -The special tokens embeddings (`[SEP]`, `[CLS]`...) are not pre-trained and need to be trained during the fine-tuning if you use them. - -The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. - The inputs and output are **identical to the TensorFlow model inputs and outputs**. We detail them here. This model takes as *inputs*: [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py) -- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[ -- `position_ids`: an optional torch.LongTensor with the same shape as input_ids with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_ctx - 1[. -- `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids. You can use it to add a third embedding (the previous two being the word and position embeddings) to each token in the sentence. +- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[ +- `position_ids`: an optional torch.LongTensor with the same shape as input_ids + with the position indices (selected in the range [0, config.n_positions - 1[. +- `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block. This model *outputs*: - `hidden_states`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) @@ -435,7 +436,7 @@ This model *outputs*: - if `lm_labels` is not `None`: Outputs the language modeling loss. - else: - Outputs `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_num_embeddings] (or more generally [d_1, ..., d_n, total_num_embeddings] were d_1 ... d_n are the dimension of input_ids) + Outputs `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings] (or more generally [d_1, ..., d_n, total_tokens_embeddings] were d_1 ... d_n are the dimension of input_ids) #### 11. `OpenAIGPTDoubleHeadsModel` @@ -452,7 +453,7 @@ This model *outputs*: - if `lm_labels` and `multiple_choice_labels` are not `None`: Outputs a tuple of losses with the language modeling loss and the multiple choice loss. - else Outputs a tuple with: - - `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_num_embeddings] + - `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings] - `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices] diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 2e2dc56984..14d5cf7ef2 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -185,8 +185,8 @@ class OpenAIGPTConfig(object): ) @property - def total_num_embeddings(self): - return self.vocab_size + self.n_special + self.n_positions + def total_tokens_embeddings(self): + return self.vocab_size + self.n_special @classmethod def from_dict(cls, json_object): @@ -533,45 +533,44 @@ class OpenAIGPTPreTrainedModel(nn.Module): "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)) ) # Add additional embeddings for special tokens if needed - if num_special_tokens is not None and num_special_tokens != config.n_special: - model.set_num_special_tokens(num_special_tokens) + # This step also make sure we are still sharing the output and input embeddings after loading weights + model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special) return model class OpenAIGPTModel(OpenAIGPTPreTrainedModel): """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training"). - The main implementation difference between BERT and the OpenAI is the use, in OpenAI GPT, of a single embedding matrix - to store the word, special ([SEP], [CLS]...) and position embeddings. - The embeddings are ordered as follow in the word embeddings matrice: + OpenAI GPT use a single embedding matrix to store the word and special embeddings. + Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follow in the token embeddings matrice: [0, ---------------------- ... -> word embeddings config.vocab_size - 1, ______________________ config.vocab_size, ... -> special embeddings - config.vocab_size + config.n_special - 1, ______________________ - config.vocab_size + config.n_special, - ... -> position embeddings - total_num_embeddings - 1] ______________________ + config.vocab_size + config.n_special - 1] ______________________ - where total_num_embeddings can be obtained as config.total_num_embeddings and is: - total_num_embeddings = config.vocab_size + config.n_special + config.n_positions + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special You should use the associate indices to index the embeddings. - The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them. - The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. - Params: config: a OpenAIGPTConfig class instance with the configuration to build a new model Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] - were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[ + were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[ `position_ids`: an optional torch.LongTensor with the same shape as input_ids - with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_positions - 1[. + with the position indices (selected in the range [0, config.n_positions - 1[. `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids - You can use it to add a third embedding (the previous two being the word and position embeddings) - to each token in the sentence. + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. Outputs: `hidden_states`: the encoded-hidden-states at the top of the model @@ -603,12 +602,14 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): # nn.init.normal_(self.embed.weight, std=0.02) def set_num_special_tokens(self, num_special_tokens): - " Update input embeddings with new embedding matrice " + " Update input embeddings with new embedding matrice if needed " + if self.config.n_special == num_special_tokens: + return # Update config self.config.n_special = num_special_tokens # # Build new embeddings and initialize old_embed = self.tokens_embed - self.tokens_embed = nn.Embedding(self.config.total_num_embeddings, self.config.n_embd) + self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd) # Initialize all new embeddings (in particular the special tokens) self.init_weights(self.tokens_embed) # Copy word and positional embeddings from the previous weights @@ -646,39 +647,36 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): """OpenAI GPT model with a Language Modeling head ("Improving Language Understanding by Generative Pre-Training"). - There are two main implementation differences between BERT and the OpenAI GPT: - - the use of an LM loss in OpenAI GPT which means the Transformer is trained to predict the NEXT token for each input token - vs. predict the SAME token for BERT (i.e. you need to shift your labels to the right) - - the use, in OpenAI GPT, of a single embedding matrix to store the word, special ([SEP], [CLS]...) and position embeddings. - The embeddings are ordered as follow in the word embeddings matrice: + OpenAI GPT use a single embedding matrix to store the word and special embeddings. + Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follow in the token embeddings matrice: [0, ---------------------- ... -> word embeddings config.vocab_size - 1, ______________________ config.vocab_size, ... -> special embeddings - config.vocab_size + config.n_special - 1, ______________________ - config.vocab_size + config.n_special, - ... -> position embeddings - total_num_embeddings - 1] ______________________ + config.vocab_size + config.n_special - 1] ______________________ - where total_num_embeddings can be obtained as config.total_num_embeddings and is: - total_num_embeddings = config.vocab_size + config.n_special + config.n_positions - You should use these indices to index the word, special and position embeddings. - - The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them. - The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special + You should use the associate indices to index the embeddings. Params: config: a OpenAIGPTConfig class instance with the configuration to build a new model Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] - were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[ + were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[ `position_ids`: an optional torch.LongTensor with the same shape as input_ids - with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_positions - 1[. + with the position indices (selected in the range [0, config.n_positions - 1[. `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids - You can use it to add a third embedding (the previous two being the word and position embeddings) - to each token in the sentence. + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size] @@ -687,8 +685,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): if `lm_labels` is not `None`: Outputs the language modeling loss. else: - `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_num_embeddings] - (or more generally [d_1, ..., d_n, total_num_embeddings] were d_1 ... d_n are the dimension of input_ids) + `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings] + (or more generally [d_1, ..., d_n, total_tokens_embeddings] were d_1 ... d_n are the dimension of input_ids) Example usage: ```python @@ -726,45 +724,39 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): """OpenAI GPT model with a Language Modeling and a Multiple Choice heads ("Improving Language Understanding by Generative Pre-Training"). - There are two main implementation differences between BERT and the OpenAI GPT: - - the use of an LM loss in OpenAI GPT which means the Transformer is trained to predict the NEXT token for each input token - vs. predict the SAME token for BERT (i.e. you need to shift your labels to the right) - - the use, in OpenAI GPT, of a single embedding matrix to store the word, special ([SEP], [CLS]...) and position embeddings. - The embeddings are ordered as follow in the word embeddings matrice: + OpenAI GPT use a single embedding matrix to store the word and special embeddings. + Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follow in the token embeddings matrice: [0, ---------------------- ... -> word embeddings config.vocab_size - 1, ______________________ config.vocab_size, ... -> special embeddings - config.vocab_size + config.n_special - 1, ______________________ - config.vocab_size + config.n_special, - ... -> position embeddings - total_num_embeddings - 1] ______________________ + config.vocab_size + config.n_special - 1] ______________________ - where total_num_embeddings can be obtained as config.total_num_embeddings and is: - total_num_embeddings = config.vocab_size + config.n_special + config.n_positions - You should use these indices to index the word, special and position embeddings. - - The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them. - The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special + You should use the associate indices to index the embeddings. Params: config: a OpenAIGPTConfig class instance with the configuration to build a new model Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] - with the word BPE token indices selected in the range [0, config.vocab_size[ - `mc_token_mask`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] - with a value of 1 were the last hidden state is (usually the [CLS] token) and 0 otherwise. + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] + were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[ `position_ids`: an optional torch.LongTensor with the same shape as input_ids - with the position indices (selected in the range [config.vocab_size + config.n_special, - config.vocab_size + config.n_special + config.n_positions - 1[. + with the position indices (selected in the range [0, config.n_positions - 1[. `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids - You can use it to add a third embedding (the previous two being the word and position embeddings) - to each token in the sentence. + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length] - with indices selected in [-1, 0, ..., total_num_embeddings]. All labels set to -1 are ignored (masked), the loss - is only computed for the labels set in [0, ..., total_num_embeddings] + with indices selected in [-1, 0, ..., total_tokens_embeddings]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., total_tokens_embeddings] `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_choices]. @@ -772,7 +764,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): if `lm_labels` and `multiple_choice_labels` are not `None`: Outputs a tuple of losses with the language modeling loss and the multiple choice loss. else: a tuple with - `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_num_embeddings] + `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings] `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices] Example usage: