From bd404735a7f282a41b11e240eb7c880e329567c3 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 12 Jul 2019 00:02:49 +0200 Subject: [PATCH] embeddings resizing + tie_weights --- pytorch_transformers/modeling_bert.py | 53 +++++-- pytorch_transformers/modeling_gpt2.py | 148 +++-------------- pytorch_transformers/modeling_openai.py | 150 ++++-------------- pytorch_transformers/modeling_transfo_xl.py | 7 + pytorch_transformers/modeling_utils.py | 41 ++++- pytorch_transformers/modeling_xlm.py | 13 +- pytorch_transformers/modeling_xlnet.py | 12 +- .../tests/modeling_bert_test.py | 2 +- ...sts_commons.py => modeling_common_test.py} | 45 +++++- .../tests/modeling_gpt2_test.py | 2 +- .../tests/modeling_openai_test.py | 2 +- .../tests/modeling_transfo_xl_test.py | 2 +- .../tests/modeling_utils_test.py | 47 ------ .../tests/modeling_xlm_test.py | 2 +- .../tests/modeling_xlnet_test.py | 2 +- 15 files changed, 196 insertions(+), 332 deletions(-) rename pytorch_transformers/tests/{modeling_tests_commons.py => modeling_common_test.py} (91%) delete mode 100644 pytorch_transformers/tests/modeling_utils_test.py diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py index 6da6a5e507..d88c57bb79 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/pytorch_transformers/modeling_bert.py @@ -507,23 +507,17 @@ class BertPredictionHeadTransform(nn.Module): class BertLMPredictionHead(nn.Module): - def __init__(self, config, bert_model_embedding_weights): + def __init__(self, config): super(BertLMPredictionHead, self).__init__() self.transform = BertPredictionHeadTransform(config) - self.torchscript = config.torchscript # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. 
- self.decoder = nn.Linear(bert_model_embedding_weights.size(1), - bert_model_embedding_weights.size(0), + self.decoder = nn.Linear(config.hidden_size, + config.vocab_size, bias=False) - if self.torchscript: - self.decoder.weight = nn.Parameter(bert_model_embedding_weights.clone()) - else: - self.decoder.weight = bert_model_embedding_weights - - self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0))) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) def forward(self, hidden_states): hidden_states = self.transform(hidden_states) @@ -532,9 +526,9 @@ class BertLMPredictionHead(nn.Module): class BertOnlyMLMHead(nn.Module): - def __init__(self, config, bert_model_embedding_weights): + def __init__(self, config): super(BertOnlyMLMHead, self).__init__() - self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) + self.predictions = BertLMPredictionHead(config) def forward(self, sequence_output): prediction_scores = self.predictions(sequence_output) @@ -552,9 +546,9 @@ class BertOnlyNSPHead(nn.Module): class BertPreTrainingHeads(nn.Module): - def __init__(self, config, bert_model_embedding_weights): + def __init__(self, config): super(BertPreTrainingHeads, self).__init__() - self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) + self.predictions = BertLMPredictionHead(config) self.seq_relationship = nn.Linear(config.hidden_size, 2) def forward(self, sequence_output, pooled_output): @@ -619,6 +613,11 @@ class BertModel(BertPreTrainedModel): self.apply(self.init_weights) + def _resize_token_embeddings(self, new_num_tokens): + old_embeddings = self.embeddings.word_embeddings + new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) + self.embeddings.word_embeddings = new_embeddings + def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} @@ -750,9 +749,20 @@ class BertForPreTraining(BertPreTrainedModel): super(BertForPreTraining, self).__init__(config) self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) + self.cls = BertPreTrainingHeads(config) self.apply(self.init_weights) + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. + """ + input_embeddings = self.bert.embeddings.word_embeddings.weight + if self.config.torchscript: + self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone()) + else: + self.cls.predictions.decoder.weight = input_embeddings # Tied weights def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None, head_mask=None): @@ -845,9 +855,20 @@ class BertForMaskedLM(BertPreTrainedModel): super(BertForMaskedLM, self).__init__(config) self.bert = BertModel(config) - self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) + self.cls = BertOnlyMLMHead(config) self.apply(self.init_weights) + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. 
+ """ + input_embeddings = self.bert.embeddings.word_embeddings.weight + if self.config.torchscript: + self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone()) + else: + self.cls.predictions.decoder.weight = input_embeddings # Tied weights def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None): """ diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 495e002529..06f933147f 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -104,7 +104,6 @@ class GPT2Config(PretrainedConfig): Args: vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. - n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) n_positions: Number of positional embeddings. n_ctx: Size of the causal mask (usually same as n_positions). n_embd: Dimensionality of the embeddings and hidden states. @@ -119,14 +118,12 @@ class GPT2Config(PretrainedConfig): embd_pdrop: The dropout ratio for the embeddings. initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. - predict_special_tokens: should we predict special tokens (when the model has a LM head) """ pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__( self, vocab_size_or_config_json_file=50257, - n_special=0, n_positions=1024, n_ctx=1024, n_embd=768, @@ -137,7 +134,6 @@ class GPT2Config(PretrainedConfig): attn_pdrop=0.1, layer_norm_epsilon=1e-5, initializer_range=0.02, - predict_special_tokens=True, num_labels=1, summary_type='token_ids', @@ -151,7 +147,6 @@ class GPT2Config(PretrainedConfig): Args: vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. - n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) 
n_positions: Number of positional embeddings. n_ctx: Size of the causal mask (usually same as n_positions). n_embd: Dimensionality of the embeddings and hidden states. @@ -166,7 +161,6 @@ class GPT2Config(PretrainedConfig): embd_pdrop: The dropout ratio for the embeddings. initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. - predict_special_tokens: should we predict special tokens (when the model has a LM head) """ super(GPT2Config, self).__init__(**kwargs) @@ -178,7 +172,6 @@ class GPT2Config(PretrainedConfig): self.__dict__[key] = value elif isinstance(vocab_size_or_config_json_file, int): self.vocab_size = vocab_size_or_config_json_file - self.n_special = n_special self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd @@ -189,7 +182,6 @@ class GPT2Config(PretrainedConfig): self.attn_pdrop = attn_pdrop self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range - self.predict_special_tokens = predict_special_tokens self.num_labels = num_labels self.summary_type = summary_type @@ -203,10 +195,6 @@ class GPT2Config(PretrainedConfig): "or the path to a pretrained model config file (str)" ) - @property - def total_tokens_embeddings(self): - return self.vocab_size + self.n_special - @property def hidden_size(self): return self.n_embd @@ -347,34 +335,6 @@ class Block(nn.Module): return outputs # x, present, (attentions) -class GPT2LMHead(nn.Module): - """ Language Model Head for the transformer """ - - def __init__(self, model_embeddings_weights, config): - super(GPT2LMHead, self).__init__() - self.n_embd = config.n_embd - self.vocab_size = config.vocab_size - self.predict_special_tokens = config.predict_special_tokens - self.torchscript = config.torchscript - embed_shape = model_embeddings_weights.shape - self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) - self.set_embeddings_weights(model_embeddings_weights) - - def set_embeddings_weights(self, 
model_embeddings_weights, predict_special_tokens=True): - self.predict_special_tokens = predict_special_tokens - # Export to TorchScript can't handle parameter sharing so we are cloning them. - if self.torchscript: - self.decoder.weight = nn.Parameter(model_embeddings_weights.clone()) - else: - self.decoder.weight = model_embeddings_weights # Tied weights - - def forward(self, hidden_state): - lm_logits = self.decoder(hidden_state) - if not self.predict_special_tokens: - lm_logits = lm_logits[..., :self.vocab_size] - return lm_logits - - class GPT2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. @@ -400,36 +360,6 @@ class GPT2PreTrainedModel(PreTrainedModel): module.bias.data.zero_() module.weight.data.fill_(1.0) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): - """ - Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict. - Download and cache the pre-trained model file if needed. - - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - . `gpt2` - - a path or url to a pretrained model archive containing: - . `gpt2_config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance - - a path or url to a pretrained model archive containing: - . `gpt2_config.json` a configuration file for the model - . a TensorFlow checkpoint with trained weights - from_tf: should we load the weights from a locally saved TensorFlow checkpoint - cache_dir: an optional path to a folder in which the pre-trained models will be cached. 
- state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific GPT2 class - """ - num_special_tokens = kwargs.pop('num_special_tokens', None) - - model = super().from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - - # Add additional embeddings for special tokens if needed - # This step also make sure we are still sharing the output and input embeddings after loading weights - model.set_num_special_tokens(num_special_tokens) - return model - class GPT2Model(GPT2PreTrainedModel): """OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners"). @@ -447,13 +377,13 @@ class GPT2Model(GPT2PreTrainedModel): config.vocab_size - 1, ______________________ config.vocab_size, ... -> special embeddings - config.vocab_size + config.n_special - 1] ______________________ + config.vocab_size + n_special - 1] ______________________ - where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is equal to + where total_tokens_embeddings is equal to :: - total_tokens_embeddings = config.vocab_size + config.n_special + total_tokens_embeddings = vocab_size + n_special You should use the associated indices to index the embeddings. @@ -474,7 +404,7 @@ class GPT2Model(GPT2PreTrainedModel): self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions - self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd) + self.wte = nn.Embedding(config.vocab_size, config.n_embd) self.wpe = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) @@ -482,26 +412,8 @@ class GPT2Model(GPT2PreTrainedModel): self.apply(self.init_weights) - def set_num_special_tokens(self, num_special_tokens=None): - """ - Update input embeddings with new embedding matrix if needed. 
- - Args: - num_special_tokens: Special tokens to be added to the embedding matrix - - TODO Lysandre filled args - """ - if num_special_tokens is None or self.config.n_special == num_special_tokens: - return - # Update config - self.config.n_special = num_special_tokens - # Build new embeddings and initialize all new embeddings (in particular the special tokens) - old_embed = self.wte - self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd) - self.wte.to(old_embed.weight.device) - self.init_weights(self.wte) - # Copy word embeddings from the previous weights - self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :] + def _resize_token_embeddings(self, new_num_tokens): + self.wte = self._get_resized_embeddings(self.wte, new_num_tokens) def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. @@ -641,23 +553,20 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): def __init__(self, config): super(GPT2LMHeadModel, self).__init__(config) self.transformer = GPT2Model(config) - self.lm_head = GPT2LMHead(self.transformer.wte.weight, config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + self.apply(self.init_weights) + self.tie_weights() - def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. """ - Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings. - - Args: - num_special_tokens: Special tokens to be added to the embedding matrix - predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``. - Defaults to True. 
- - TODO Lysandre filled args - """ - self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens - self.transformer.set_num_special_tokens(num_special_tokens) - self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens) + input_embeddings = self.transformer.wte.weight + if self.config.torchscript: + self.lm_head.weight = nn.Parameter(input_embeddings.clone()) + else: + self.lm_head.weight = input_embeddings # Tied weights def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None): """ @@ -740,25 +649,20 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): def __init__(self, config): super(GPT2DoubleHeadsModel, self).__init__(config) self.transformer = GPT2Model(config) - self.lm_head = GPT2LMHead(self.transformer.wte.weight, config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.multiple_choice_head = SequenceSummary(config) self.apply(self.init_weights) - def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. """ - Update input and output embeddings with new embedding matrix.Make sure we are sharing the embeddings - - Args: - num_special_tokens: Special tokens to be added to the embedding matrix - predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``. - Defaults to True. 
- - TODO Lysandre filled args - """ - self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens - self.transformer.set_num_special_tokens(num_special_tokens) - self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens) + input_embeddings = self.transformer.wte.weight + if self.config.torchscript: + self.lm_head.weight = nn.Parameter(input_embeddings.clone()) + else: + self.lm_head.weight = input_embeddings # Tied weights def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, past=None, head_mask=None): diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index aa35b163f1..ebf1035d21 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -156,7 +156,6 @@ class OpenAIGPTConfig(PretrainedConfig): def __init__( self, vocab_size_or_config_json_file=40478, - n_special=0, n_positions=512, n_ctx=512, n_embd=768, @@ -190,7 +189,6 @@ class OpenAIGPTConfig(PretrainedConfig): self.__dict__[key] = value elif isinstance(vocab_size_or_config_json_file, int): self.vocab_size = vocab_size_or_config_json_file - self.n_special = n_special self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd @@ -216,10 +214,6 @@ class OpenAIGPTConfig(PretrainedConfig): "or the path to a pretrained model config file (str)" ) - @property - def total_tokens_embeddings(self): - return self.vocab_size + self.n_special - @property def hidden_size(self): return self.n_embd @@ -355,34 +349,6 @@ class Block(nn.Module): return outputs -class OpenAIGPTLMHead(nn.Module): - """ Language Model Head for the transformer """ - - def __init__(self, model_embeddings_weights, config): - super(OpenAIGPTLMHead, self).__init__() - self.n_embd = config.n_embd - self.vocab_size = config.vocab_size - self.predict_special_tokens = 
config.predict_special_tokens - self.torchscript = config.torchscript - embed_shape = model_embeddings_weights.shape - self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) - self.set_embeddings_weights(model_embeddings_weights) - - def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True): - self.predict_special_tokens = predict_special_tokens - - if self.torchscript: - self.decoder.weight = nn.Parameter(model_embeddings_weights.clone()) - else: - self.decoder.weight = model_embeddings_weights # Tied weights - - def forward(self, hidden_state): - lm_logits = self.decoder(hidden_state) - if not self.predict_special_tokens: - lm_logits = lm_logits[..., :self.vocab_size] - return lm_logits - - class OpenAIGPTPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. @@ -408,36 +374,6 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel): module.bias.data.zero_() module.weight.data.fill_(1.0) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): - """ - Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict. - Download and cache the pre-trained model file if needed. - - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - - a path or url to a pretrained model archive containing: - . `config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance - - a path or url to a pretrained model archive containing: - . `config.json` a configuration file for the model - . a series of NumPy files containing OpenAI TensorFlow trained weights - from_tf: should we load the weights from a locally saved TensorFlow checkpoint - cache_dir: an optional path to a folder in which the pre-trained models will be cached. 
- state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific OpenAI-GPT class - """ - num_special_tokens = kwargs.get('num_special_tokens', None) - kwargs.pop('num_special_tokens', None) - - model = super(PreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, pretrained_model_name_or_path, *inputs, **kwargs) - - # Add additional embeddings for special tokens if needed - # This step also make sure we are still sharing the output and input embeddings after loading weights - model.set_num_special_tokens(num_special_tokens) - return model - class OpenAIGPTModel(OpenAIGPTPreTrainedModel): """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training"). @@ -457,13 +393,13 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): config.vocab_size - 1, ______________________ config.vocab_size, ... -> special embeddings - config.vocab_size + config.n_special - 1] ______________________ + config.vocab_size + n_special - 1] ______________________ - where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is: + where ``total_tokens_embeddings`` is: :: - total_tokens_embeddings = config.vocab_size + config.n_special + total_tokens_embeddings = config.vocab_size + n_special You should use the associated indices to index the embeddings. 
@@ -485,34 +421,15 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states - self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd) + self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd) self.positions_embed = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) self.apply(self.init_weights) - def set_num_special_tokens(self, num_special_tokens=None): - """ - Update input embeddings with new embedding matrice if needed - - Args: - num_special_tokens: Special tokens to be added to the embedding matrix - - TODO Lysandre filled Args - - """ - if num_special_tokens is None or self.config.n_special == num_special_tokens: - return - # Update config - self.config.n_special = num_special_tokens - # Build new embeddings and initialize all new embeddings (in particular the special tokens) - old_embed = self.tokens_embed - self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd) - self.tokens_embed.to(old_embed.weight.device) - self.init_weights(self.tokens_embed) - # Copy word embeddings from the previous weights - self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :] + def _resize_token_embeddings(self, new_num_tokens): + self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens) def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. 
@@ -657,24 +574,20 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): def __init__(self, config): super(OpenAIGPTLMHeadModel, self).__init__(config) self.transformer = OpenAIGPTModel(config) - self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + self.apply(self.init_weights) + self.tie_weights() - def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. """ - Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings - - Args: - num_special_tokens: Special tokens to be added to the embedding matrix - predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``. - Defaults to True. - - TODO Lysandre filled Args - - """ - self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens - self.transformer.set_num_special_tokens(num_special_tokens) - self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens) + input_embeddings = self.transformer.tokens_embed.weight + if self.config.torchscript: + self.lm_head.weight = nn.Parameter(input_embeddings.clone()) + else: + self.lm_head.weight = input_embeddings # Tied weights def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None): """ @@ -747,13 +660,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): config.vocab_size - 1, ______________________ config.vocab_size, ... 
-> special embeddings - config.vocab_size + config.n_special - 1] ______________________ + config.vocab_size + n_special - 1] ______________________ - where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is: + where ``total_tokens_embeddings`` is: :: - total_tokens_embeddings = config.vocab_size + config.n_special + total_tokens_embeddings = config.vocab_size + n_special You should use the associate indices to index the embeddings. @@ -773,24 +686,21 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): super(OpenAIGPTDoubleHeadsModel, self).__init__(config) self.transformer = OpenAIGPTModel(config) - self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.multiple_choice_head = SequenceSummary(config) self.apply(self.init_weights) + self.tie_weights() - def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): - """ Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings. - - Args: - num_special_tokens: Special tokens to be added to the embedding matrix - predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``. - Defaults to True. - - TODO Lysandre filled Args + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. 
""" - self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens - self.transformer.set_num_special_tokens(num_special_tokens) - self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens) + input_embeddings = self.transformer.tokens_embed.weight + if self.config.torchscript: + self.lm_head.weight = nn.Parameter(input_embeddings.clone()) + else: + self.lm_head.weight = input_embeddings # Tied weights def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, head_mask=None): diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py index b194f43b68..71f80a9eea 100644 --- a/pytorch_transformers/modeling_transfo_xl.py +++ b/pytorch_transformers/modeling_transfo_xl.py @@ -287,6 +287,10 @@ class TransfoXLConfig(PretrainedConfig): raise ValueError("First argument must be either a vocabulary size (int)" "or the path to a pretrained model config file (str)") + @property + def vocab_size(self): + return self.n_token + @property def hidden_size(self): return self.d_model @@ -998,6 +1002,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel): self.apply(self.init_weights) + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError + def backward_compatible(self): self.sample_softmax = -1 diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index a9445ecad5..8fdfda4720 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -151,6 +151,7 @@ class PreTrainedModel(nn.Module): pretrained_model_archive_map = {} load_tf_weights = lambda model, config, path: None base_model_prefix = "" + input_embeddings = None def __init__(self, config, *inputs, **kwargs): super(PreTrainedModel, self).__init__() @@ -164,12 +165,48 @@ class PreTrainedModel(nn.Module): # Save config in 
model self.config = config + def _get_resized_embeddings(self, old_embeddings, new_num_tokens): + # Build new embeddings + old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) + new_embeddings.to(old_embeddings.weight.device) + + # initialize all new embeddings (in particular added tokens) + self.init_weights(new_embeddings) + + # Copy word embeddings from the previous weights + num_tokens_to_copy = min(old_num_tokens, new_num_tokens) + new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] + + return new_embeddings + + def resize_token_embeddings(self, new_num_tokens): + """ Resize input token embeddings matrix. + + Args: + new_num_tokens: New number of tokens in the embedding matrix. + Increasing the size will add newly initialized vectors at the end + Reducing the size will remove vectors from the end + """ + if new_num_tokens == self.config.vocab_size: + return + base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed + base_model._resize_token_embeddings(new_num_tokens) + + # Update base model and current model config + self.config.vocab_size = new_num_tokens + base_model.vocab_size = new_num_tokens + + # Tie weights again if needed + if hasattr(self, 'tie_weights'): + self.tie_weights() + def prune_heads(self, heads_to_prune): """ Prunes heads of the base model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} """ - model_to_prune = getattr(self, self.base_model_prefix, self) # get the base model if needed - model_to_prune._prune_heads(heads_to_prune) + base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed + base_model._prune_heads(heads_to_prune) def save_pretrained(self, save_directory): """ Save a model with its configuration file to a directory, so that it diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index 7567a0f24b..3d5b35fae6 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -104,7 +104,6 @@ class XLMConfig(PretrainedConfig): def __init__(self, vocab_size_or_config_json_file=30145, - n_special=0, emb_dim=2048, n_layers=12, n_heads=16, @@ -148,7 +147,6 @@ class XLMConfig(PretrainedConfig): self.__dict__[key] = value elif isinstance(vocab_size_or_config_json_file, int): self.n_words = vocab_size_or_config_json_file - self.n_special = n_special self.emb_dim = emb_dim self.n_layers = n_layers self.n_heads = n_heads @@ -183,8 +181,8 @@ class XLMConfig(PretrainedConfig): "or the path to a pretrained model config file (str)") @property - def total_tokens_embeddings(self): - return self.n_words + self.n_special + def vocab_size(self): + return self.n_words @property def hidden_size(self): @@ -479,6 +477,9 @@ class XLMModel(XLMPreTrainedModel): self.apply(self.init_weights) + def _resize_token_embeddings(self, new_num_tokens): + self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens) + def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} @@ -718,8 +719,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): """ def __init__(self, config): super(XLMWithLMHeadModel, self).__init__(config) - self.torchscript = config.torchscript - self.transformer = XLMModel(config) self.pred_layer = XLMPredLayer(config) @@ -729,7 +728,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): def tie_weights(self): """ Make sure we are sharing the embeddings """ - if self.torchscript: + if self.config.torchscript: self.pred_layer.proj.weight = nn.Parameter(self.transformer.embeddings.weight.clone()) else: self.pred_layer.proj.weight = self.transformer.embeddings.weight diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py index a5f95957c3..36c068e3a3 100644 --- a/pytorch_transformers/modeling_xlnet.py +++ b/pytorch_transformers/modeling_xlnet.py @@ -312,6 +312,10 @@ class XLNetConfig(PretrainedConfig): raise ValueError("First argument must be either a vocabulary size (int)" "or the path to a pretrained model config file (str)") + @property + def vocab_size(self): + return self.n_token + @property def hidden_size(self): return self.d_model @@ -654,6 +658,9 @@ class XLNetModel(XLNetPreTrainedModel): self.apply(self.init_weights) + def _resize_token_embeddings(self, new_num_tokens): + self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens) + def _prune_heads(self, heads_to_prune): logger.info("Head pruning is not implemented for XLNet") pass @@ -970,20 +977,17 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): super(XLNetLMHeadModel, self).__init__(config) self.attn_type = config.attn_type self.same_length = config.same_length - self.torchscript = config.torchscript self.transformer = XLNetModel(config) self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True) - # Tie weights - self.apply(self.init_weights) self.tie_weights() def tie_weights(self): """ Make sure we are sharing the 
embeddings """ - if self.torchscript: + if self.config.torchscript: self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone()) else: self.lm_loss.weight = self.transformer.word_embedding.weight diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py index fbdce29366..4ab0c9d157 100644 --- a/pytorch_transformers/tests/modeling_bert_test.py +++ b/pytorch_transformers/tests/modeling_bert_test.py @@ -26,7 +26,7 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM, BertForTokenClassification, BertForMultipleChoice) from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor) +from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor) class BertModelTest(unittest.TestCase): diff --git a/pytorch_transformers/tests/modeling_tests_commons.py b/pytorch_transformers/tests/modeling_common_test.py similarity index 91% rename from pytorch_transformers/tests/modeling_tests_commons.py rename to pytorch_transformers/tests/modeling_common_test.py index 5535177aaa..98849216fa 100644 --- a/pytorch_transformers/tests/modeling_tests_commons.py +++ b/pytorch_transformers/tests/modeling_common_test.py @@ -22,8 +22,15 @@ import shutil import json import random +import unittest +import logging + import torch +from pytorch_transformers import PretrainedConfig, PreTrainedModel +from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP + + def _config_zero_init(config): configs_no_init = copy.deepcopy(config) for key in configs_no_init.__dict__.keys(): @@ -242,6 +249,7 @@ class ConfigTester(object): def create_and_test_config_common_properties(self): config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, 'vocab_size')) self.parent.assertTrue(hasattr(config, 'hidden_size')) 
self.parent.assertTrue(hasattr(config, 'num_attention_heads')) self.parent.assertTrue(hasattr(config, 'num_hidden_layers')) @@ -276,7 +284,6 @@ class GPTModelTester(object): use_token_type_ids=True, use_labels=True, vocab_size=99, - n_special=1, n_positions=33, hidden_size=32, num_hidden_layers=5, @@ -299,7 +306,6 @@ class GPTModelTester(object): self.use_token_type_ids = use_token_type_ids self.use_labels = use_labels self.vocab_size = vocab_size - self.n_special = n_special self.n_positions = n_positions self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -316,7 +322,7 @@ class GPTModelTester(object): self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class) def prepare_config_and_inputs(self): - total_num_tokens = self.vocab_size + self.n_special + total_num_tokens = self.vocab_size input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens) position_ids = None @@ -338,7 +344,6 @@ class GPTModelTester(object): config = self.config_class( vocab_size_or_config_json_file=self.vocab_size, - n_special=self.n_special, n_positions=self.n_positions, n_embd=self.hidden_size, n_layer=self.num_hidden_layers, @@ -370,7 +375,7 @@ class GPTModelTester(object): outputs = model(input_ids, position_ids, token_type_ids, lm_labels) loss, lm_logits = outputs[:2] - total_voc = self.n_special + self.vocab_size + total_voc = self.vocab_size self.parent.assertListEqual( list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]) @@ -400,7 +405,7 @@ class GPTModelTester(object): lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4] loss = [lm_loss, mc_loss] - total_voc = self.n_special + self.vocab_size + total_voc = self.vocab_size self.parent.assertListEqual( list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]) @@ -441,6 +446,30 @@ class GPTModelTester(object): self.create_and_check_commons(*config_and_inputs) def run_slow_tests(self): - 
config_and_inputs = self.prepare_config_and_inputs() - self.create_and_check_model_from_pretrained(*config_and_inputs) + self.create_and_check_model_from_pretrained() + +class ModelUtilsTest(unittest.TestCase): + def test_model_from_pretrained(self): + logging.basicConfig(level=logging.INFO) + for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + config = BertConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, PretrainedConfig) + + model = BertModel.from_pretrained(model_name) + model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, PreTrainedModel) + for value in loading_info.values(): + self.assertEqual(len(value), 0) + + config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True) + model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True) + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, True) + self.assertEqual(model.config, config) + + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py index 7400c9f64d..00a9cb4614 100644 --- a/pytorch_transformers/tests/modeling_gpt2_test.py +++ b/pytorch_transformers/tests/modeling_gpt2_test.py @@ -28,7 +28,7 @@ import torch from pytorch_transformers import (GPT2Config, GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) -from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester) +from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester) class GPT2ModelTest(unittest.TestCase): diff --git a/pytorch_transformers/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py index 27263ecb24..4f57f4661b 100644 --- 
a/pytorch_transformers/tests/modeling_openai_test.py +++ b/pytorch_transformers/tests/modeling_openai_test.py @@ -24,7 +24,7 @@ import torch from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) -from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester) +from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester) class OpenAIModelTest(unittest.TestCase): diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py index 49ba1addf1..9631cd6034 100644 --- a/pytorch_transformers/tests/modeling_transfo_xl_test.py +++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py @@ -28,7 +28,7 @@ import torch from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor +from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor class TransfoXLModelTest(unittest.TestCase): class TransfoXLModelTester(object): diff --git a/pytorch_transformers/tests/modeling_utils_test.py b/pytorch_transformers/tests/modeling_utils_test.py deleted file mode 100644 index 4944f41228..0000000000 --- a/pytorch_transformers/tests/modeling_utils_test.py +++ /dev/null @@ -1,47 +0,0 @@ -# coding=utf-8 -# Copyright 2018 HuggingFace Inc.. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import unittest -import logging - -from pytorch_transformers import PretrainedConfig, PreTrainedModel -from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP - -class ModelUtilsTest(unittest.TestCase): - def test_model_from_pretrained(self): - logging.basicConfig(level=logging.INFO) - for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - config = BertConfig.from_pretrained(model_name) - self.assertIsNotNone(config) - self.assertIsInstance(config, PretrainedConfig) - - model = BertModel.from_pretrained(model_name) - model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True) - self.assertIsNotNone(model) - self.assertIsInstance(model, PreTrainedModel) - for value in loading_info.values(): - self.assertEqual(len(value), 0) - - config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True) - model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True) - self.assertEqual(model.config.output_attentions, True) - self.assertEqual(model.config.output_hidden_states, True) - self.assertEqual(model.config, config) - -if __name__ == "__main__": - unittest.main() diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py index 6e2e082d19..9d6bc4054d 100644 --- a/pytorch_transformers/tests/modeling_xlm_test.py +++ b/pytorch_transformers/tests/modeling_xlm_test.py @@ -23,7 +23,7 @@ import pytest from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification) from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tests_commons import 
(create_and_check_commons, ConfigTester, ids_tensor) +from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor) class XLMModelTest(unittest.TestCase): diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py index e167e2d2e8..41c114ce9c 100644 --- a/pytorch_transformers/tests/modeling_xlnet_test.py +++ b/pytorch_transformers/tests/modeling_xlnet_test.py @@ -28,7 +28,7 @@ import torch from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering) from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor +from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor class XLNetModelTest(unittest.TestCase): class XLNetModelTester(object):