embeddings resizing + tie_weights

2019-07-12 00:02:49 +02:00
parent 50e62a4cb4
commit bd404735a7
15 changed files with 196 additions and 332 deletions
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -507,23 +507,17 @@ class BertPredictionHeadTransform(nn.Module):
 class BertLMPredictionHead(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
        super(BertLMPredictionHead, self).__init__()
        self.transform = BertPredictionHeadTransform(config)
        self.torchscript = config.torchscript
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
-        self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
+        self.decoder = nn.Linear(config.hidden_size,
-                                 bert_model_embedding_weights.size(0),
+                                 config.vocab_size,
                                 bias=False)
-        if self.torchscript:
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
            self.decoder.weight = nn.Parameter(bert_model_embedding_weights.clone())
        else:
            self.decoder.weight = bert_model_embedding_weights
        self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
@@ -532,9 +526,9 @@ class BertLMPredictionHead(nn.Module):
 class BertOnlyMLMHead(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
        super(BertOnlyMLMHead, self).__init__()
-        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+        self.predictions = BertLMPredictionHead(config)
    def forward(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
@@ -552,9 +546,9 @@ class BertOnlyNSPHead(nn.Module):
 class BertPreTrainingHeads(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
        super(BertPreTrainingHeads, self).__init__()
-        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+        self.predictions = BertLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)
    def forward(self, sequence_output, pooled_output):
@@ -619,6 +613,11 @@ class BertModel(BertPreTrainedModel):
        self.apply(self.init_weights)
    def _resize_token_embeddings(self, new_num_tokens):
        old_embeddings = self.embeddings.word_embeddings
        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
        self.embeddings.word_embeddings = new_embeddings
    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
@@ -750,9 +749,20 @@ class BertForPreTraining(BertPreTrainedModel):
        super(BertForPreTraining, self).__init__(config)
        self.bert = BertModel(config)
-        self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
+        self.cls = BertPreTrainingHeads(config)
        self.apply(self.init_weights)
        self.tie_weights()
    def tie_weights(self):
        """ Attach the word-embedding matrix to the prediction decoder.
            By default the decoder reuses the very same parameter (tied weights).
            When `config.torchscript` is set, a cloned copy wrapped in a new `nn.Parameter`
            is used instead, because TorchScript export can't handle parameter sharing.
        """
        embedding_matrix = self.bert.embeddings.word_embeddings.weight
        if not self.config.torchscript:
            self.cls.predictions.decoder.weight = embedding_matrix  # Tied weights
            return
        self.cls.predictions.decoder.weight = nn.Parameter(embedding_matrix.clone())
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                next_sentence_label=None, head_mask=None):
@@ -845,9 +855,20 @@ class BertForMaskedLM(BertPreTrainedModel):
        super(BertForMaskedLM, self).__init__(config)
        self.bert = BertModel(config)
-        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
+        self.cls = BertOnlyMLMHead(config)
        self.apply(self.init_weights)
        self.tie_weights()
    def tie_weights(self):
        """ Share the input word embeddings with the masked-LM output decoder.
            If `config.torchscript` is set the decoder receives a cloned `nn.Parameter` rather than
            the shared tensor, since TorchScript export can't handle parameter sharing.
        """
        shared = self.bert.embeddings.word_embeddings.weight
        clone_for_export = self.config.torchscript
        # Tied weights unless exporting to TorchScript
        self.cls.predictions.decoder.weight = nn.Parameter(shared.clone()) if clone_for_export else shared
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
        """
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -104,7 +104,6 @@ class GPT2Config(PretrainedConfig):
    Args:
        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
        n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
        n_positions: Number of positional embeddings.
        n_ctx: Size of the causal mask (usually same as n_positions).
        n_embd: Dimensionality of the embeddings and hidden states.
@@ -119,14 +118,12 @@ class GPT2Config(PretrainedConfig):
        embd_pdrop: The dropout ratio for the embeddings.
        initializer_range: The stddev of the truncated_normal_initializer for
            initializing all weight matrices.
        predict_special_tokens: should we predict special tokens (when the model has a LM head)
    """
    pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
    def __init__(
        self,
        vocab_size_or_config_json_file=50257,
        n_special=0,
        n_positions=1024,
        n_ctx=1024,
        n_embd=768,
@@ -137,7 +134,6 @@ class GPT2Config(PretrainedConfig):
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        predict_special_tokens=True,
        num_labels=1,
        summary_type='token_ids',
@@ -151,7 +147,6 @@ class GPT2Config(PretrainedConfig):
        Args:
            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
            n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
            n_positions: Number of positional embeddings.
            n_ctx: Size of the causal mask (usually same as n_positions).
            n_embd: Dimensionality of the embeddings and hidden states.
@@ -166,7 +161,6 @@ class GPT2Config(PretrainedConfig):
            embd_pdrop: The dropout ratio for the embeddings.
            initializer_range: The stddev of the truncated_normal_initializer for
                initializing all weight matrices.
            predict_special_tokens: should we predict special tokens (when the model has a LM head)
        """
        super(GPT2Config, self).__init__(**kwargs)
@@ -178,7 +172,6 @@ class GPT2Config(PretrainedConfig):
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.n_special = n_special
            self.n_ctx = n_ctx
            self.n_positions = n_positions
            self.n_embd = n_embd
@@ -189,7 +182,6 @@ class GPT2Config(PretrainedConfig):
            self.attn_pdrop = attn_pdrop
            self.layer_norm_epsilon = layer_norm_epsilon
            self.initializer_range = initializer_range
            self.predict_special_tokens = predict_special_tokens
            self.num_labels = num_labels
            self.summary_type = summary_type
@@ -203,10 +195,6 @@ class GPT2Config(PretrainedConfig):
                "or the path to a pretrained model config file (str)"
            )
    @property
    def total_tokens_embeddings(self):
        """ Sum of `vocab_size` and `n_special`. """
        return self.vocab_size + self.n_special
    @property
    def hidden_size(self):
        """ Read-only alias returning `n_embd`. """
        return self.n_embd
@@ -347,34 +335,6 @@ class Block(nn.Module):
        return outputs  # x, present, (attentions)
 class GPT2LMHead(nn.Module):
    """ Language Model Head for the transformer.

        A bias-free linear decoder whose weight is the token embedding matrix it is given
        (the same tensor by default, a cloned copy when `config.torchscript` is set).
    """
    def __init__(self, model_embeddings_weights, config):
        """ Build the decoder from the shape of `model_embeddings_weights` and attach those weights.

        Args:
            model_embeddings_weights: embedding weight; its first dimension is used as the decoder
                output size and its second dimension as the decoder input size.
            config: object providing `n_embd`, `vocab_size`, `predict_special_tokens` and `torchscript`.
        """
        super(GPT2LMHead, self).__init__()
        self.n_embd = config.n_embd
        self.vocab_size = config.vocab_size
        self.predict_special_tokens = config.predict_special_tokens
        self.torchscript = config.torchscript
        embed_shape = model_embeddings_weights.shape
        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
        # Pass the configured flag explicitly: set_embeddings_weights defaults it to True,
        # which would otherwise overwrite the value read from the config just above.
        self.set_embeddings_weights(model_embeddings_weights,
                                    predict_special_tokens=config.predict_special_tokens)
    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
        """ Attach `model_embeddings_weights` to the decoder and update `predict_special_tokens`.

        Args:
            model_embeddings_weights: weight to use for the decoder.
            predict_special_tokens: stored on the head; when false, `forward` truncates the logits
                to the first `vocab_size` entries. Defaults to True.
        """
        self.predict_special_tokens = predict_special_tokens
        # Export to TorchScript can't handle parameter sharing so we are cloning them.
        if self.torchscript:
            self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
        else:
            self.decoder.weight = model_embeddings_weights  # Tied weights
    def forward(self, hidden_state):
        """ Return the decoder logits for `hidden_state`.
            When `predict_special_tokens` is false only the first `vocab_size` entries
            of the last dimension are kept.
        """
        lm_logits = self.decoder(hidden_state)
        if not self.predict_special_tokens:
            lm_logits = lm_logits[..., :self.vocab_size]
        return lm_logits
 class GPT2PreTrainedModel(PreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
@@ -400,36 +360,6 @@ class GPT2PreTrainedModel(PreTrainedModel):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        """
        Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict.
        Download and cache the pre-trained model file if needed.
        Params:
            pretrained_model_name_or_path: either:
                - a str with the name of a pre-trained model to load selected in the list of:
                    . `gpt2`
                - a path or url to a pretrained model archive containing:
                    . `gpt2_config.json` a configuration file for the model
                    . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
                - a path or url to a pretrained model archive containing:
                    . `gpt2_config.json` a configuration file for the model
                    . a TensorFlow checkpoint with trained weights
            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
            state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
            num_special_tokens: optional keyword argument (default None); it is removed from `kwargs` before the
                parent loader is called and then passed to `model.set_num_special_tokens`
            *inputs, **kwargs: additional input for the specific GPT2 class
        Returns:
            the model returned by the parent `from_pretrained`, after `set_num_special_tokens` has been called on it
        """
        # Pop our own keyword so it is not forwarded to the parent loader
        num_special_tokens = kwargs.pop('num_special_tokens', None)
        model = super().from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        # Add additional embeddings for special tokens if needed
        # This step also makes sure we are still sharing the output and input embeddings after loading weights
        model.set_num_special_tokens(num_special_tokens)
        return model
 class GPT2Model(GPT2PreTrainedModel):
    """OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners").
@@ -447,13 +377,13 @@ class GPT2Model(GPT2PreTrainedModel):
         config.vocab_size - 1,                                     ______________________
         config.vocab_size,
         ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
+         config.vocab_size + n_special - 1]                  ______________________
-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is equal to
+    where total_tokens_embeddings is equal to
    ::
-        total_tokens_embeddings = config.vocab_size + config.n_special
+        total_tokens_embeddings = vocab_size + n_special
    You should use the associated indices to index the embeddings.
@@ -474,7 +404,7 @@ class GPT2Model(GPT2PreTrainedModel):
        self.output_hidden_states = config.output_hidden_states
        self.output_attentions = config.output_attentions
-        self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
+        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
@@ -482,26 +412,8 @@ class GPT2Model(GPT2PreTrainedModel):
        self.apply(self.init_weights)
-    def set_num_special_tokens(self, num_special_tokens=None):
+    def _resize_token_embeddings(self, new_num_tokens):
-        """
+        self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
        Update input embeddings with new embedding matrix if needed.
        Args:
            num_special_tokens: Special tokens to be added to the embedding matrix
        TODO Lysandre filled args
        """
        if num_special_tokens is None or self.config.n_special == num_special_tokens:
            return
        # Update config
        self.config.n_special = num_special_tokens
        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
        old_embed = self.wte
        self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
        self.wte.to(old_embed.weight.device)
        self.init_weights(self.wte)
        # Copy word embeddings from the previous weights
        self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
@@ -641,23 +553,20 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
    def __init__(self, config):
        super(GPT2LMHeadModel, self).__init__(config)
        self.transformer = GPT2Model(config)
-        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.apply(self.init_weights)
        self.tie_weights()
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
+    def tie_weights(self):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
+        input_embeddings = self.transformer.wte.weight
-
+        if self.config.torchscript:
-        Args:
+            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
-            num_special_tokens: Special tokens to be added to the embedding matrix
+        else:
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
+            self.lm_head.weight = input_embeddings  # Tied weights
                Defaults to True.
        TODO Lysandre filled args
        """
        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
        self.transformer.set_num_special_tokens(num_special_tokens)
        self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
        """
@@ -740,25 +649,20 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
    def __init__(self, config):
        super(GPT2DoubleHeadsModel, self).__init__(config)
        self.transformer = GPT2Model(config)
-        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = SequenceSummary(config)
        self.apply(self.init_weights)
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
+    def tie_weights(self):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        Update input and output embeddings with new embedding matrix.Make sure we are sharing the embeddings
+        input_embeddings = self.transformer.wte.weight
-
+        if self.config.torchscript:
-        Args:
+            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
-            num_special_tokens: Special tokens to be added to the embedding matrix
+        else:
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
+            self.lm_head.weight = input_embeddings  # Tied weights
                Defaults to True.
        TODO Lysandre filled args
        """
        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
        self.transformer.set_num_special_tokens(num_special_tokens)
        self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                position_ids=None, past=None, head_mask=None):
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -156,7 +156,6 @@ class OpenAIGPTConfig(PretrainedConfig):
    def __init__(
        self,
        vocab_size_or_config_json_file=40478,
        n_special=0,
        n_positions=512,
        n_ctx=512,
        n_embd=768,
@@ -190,7 +189,6 @@ class OpenAIGPTConfig(PretrainedConfig):
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.n_special = n_special
            self.n_ctx = n_ctx
            self.n_positions = n_positions
            self.n_embd = n_embd
@@ -216,10 +214,6 @@ class OpenAIGPTConfig(PretrainedConfig):
                "or the path to a pretrained model config file (str)"
            )
    @property
    def total_tokens_embeddings(self):
        """ Sum of `vocab_size` and `n_special`. """
        return self.vocab_size + self.n_special
    @property
    def hidden_size(self):
        """ Read-only alias returning `n_embd`. """
        return self.n_embd
@@ -355,34 +349,6 @@ class Block(nn.Module):
        return outputs
 class OpenAIGPTLMHead(nn.Module):
    """ Language Model Head for the transformer.

        A bias-free linear decoder whose weight is the token embedding matrix it is given
        (the same tensor by default, a cloned copy when `config.torchscript` is set).
    """
    def __init__(self, model_embeddings_weights, config):
        """ Build the decoder from the shape of `model_embeddings_weights` and attach those weights.

        Args:
            model_embeddings_weights: embedding weight; its first dimension is used as the decoder
                output size and its second dimension as the decoder input size.
            config: object providing `n_embd`, `vocab_size`, `predict_special_tokens` and `torchscript`.
        """
        super(OpenAIGPTLMHead, self).__init__()
        self.n_embd = config.n_embd
        self.vocab_size = config.vocab_size
        self.predict_special_tokens = config.predict_special_tokens
        self.torchscript = config.torchscript
        embed_shape = model_embeddings_weights.shape
        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
        # Pass the configured flag explicitly: set_embeddings_weights defaults it to True,
        # which would otherwise overwrite the value read from the config just above.
        self.set_embeddings_weights(model_embeddings_weights,
                                    predict_special_tokens=config.predict_special_tokens)
    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
        """ Attach `model_embeddings_weights` to the decoder and update `predict_special_tokens`.

        Args:
            model_embeddings_weights: weight to use for the decoder.
            predict_special_tokens: stored on the head; when false, `forward` truncates the logits
                to the first `vocab_size` entries. Defaults to True.
        """
        self.predict_special_tokens = predict_special_tokens
        # With torchscript set the weights are cloned into a new parameter instead of being shared
        if self.torchscript:
            self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
        else:
            self.decoder.weight = model_embeddings_weights  # Tied weights
    def forward(self, hidden_state):
        """ Return the decoder logits for `hidden_state`.
            When `predict_special_tokens` is false only the first `vocab_size` entries
            of the last dimension are kept.
        """
        lm_logits = self.decoder(hidden_state)
        if not self.predict_special_tokens:
            lm_logits = lm_logits[..., :self.vocab_size]
        return lm_logits
 class OpenAIGPTPreTrainedModel(PreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
@@ -408,36 +374,6 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        """
        Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
        Download and cache the pre-trained model file if needed.
        Params:
            pretrained_model_name_or_path: either:
                - a str with the name of a pre-trained model to load selected in the list of:
                - a path or url to a pretrained model archive containing:
                    . `config.json` a configuration file for the model
                    . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
                - a path or url to a pretrained model archive containing:
                    . `config.json` a configuration file for the model
                    . a series of NumPy files containing OpenAI TensorFlow trained weights
            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
            state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
            num_special_tokens: optional keyword argument (default None); it is removed from `kwargs` before the
                parent loader is called and then passed to `model.set_num_special_tokens`
            *inputs, **kwargs: additional input for the specific OpenAI-GPT class
        Returns:
            the model returned by the parent `from_pretrained`, after `set_num_special_tokens` has been called on it
        """
        # Pop our own keyword so it is not forwarded to the parent loader
        num_special_tokens = kwargs.pop('num_special_tokens', None)
        # Start the lookup at this class's parent and pass the model name/path exactly once
        model = super(OpenAIGPTPreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        # Add additional embeddings for special tokens if needed
        # This step also makes sure we are still sharing the output and input embeddings after loading weights
        model.set_num_special_tokens(num_special_tokens)
        return model
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
    """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
@@ -457,13 +393,13 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         config.vocab_size - 1,                                     ______________________
         config.vocab_size,
         ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
+         config.vocab_size + n_special - 1]                  ______________________
-    where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
+    where ``total_tokens_embeddings``  is:
    ::
-        total_tokens_embeddings = config.vocab_size + config.n_special
+        total_tokens_embeddings = config.vocab_size + n_special
    You should use the associated indices to index the embeddings.
@@ -485,34 +421,15 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
-        self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
+        self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)
        self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
        self.apply(self.init_weights)
-    def set_num_special_tokens(self, num_special_tokens=None):
+    def _resize_token_embeddings(self, new_num_tokens):
-        """
+        self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
        Update input embeddings with new embedding matrix if needed
        Args:
            num_special_tokens: Special tokens to be added to the embedding matrix
        TODO Lysandre filled Args
        """
        if num_special_tokens is None or self.config.n_special == num_special_tokens:
            return
        # Update config
        self.config.n_special = num_special_tokens
        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
        old_embed = self.tokens_embed
        self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
        self.tokens_embed.to(old_embed.weight.device)
        self.init_weights(self.tokens_embed)
        # Copy word embeddings from the previous weights
        self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
@@ -657,24 +574,20 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
    def __init__(self, config):
        super(OpenAIGPTLMHeadModel, self).__init__(config)
        self.transformer = OpenAIGPTModel(config)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.apply(self.init_weights)
        self.tie_weights()
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
+    def tie_weights(self):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings
+        input_embeddings = self.transformer.tokens_embed.weight
-
+        if self.config.torchscript:
-        Args:
+            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
-            num_special_tokens: Special tokens to be added to the embedding matrix
+        else:
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
+            self.lm_head.weight = input_embeddings  # Tied weights
                Defaults to True.
        TODO Lysandre filled Args
        """
        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
        self.transformer.set_num_special_tokens(num_special_tokens)
        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
        """
@@ -747,13 +660,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         config.vocab_size - 1,                                     ______________________
         config.vocab_size,
         ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
+         config.vocab_size + n_special - 1]                  ______________________
-    where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
+    where ``total_tokens_embeddings`` is:
    ::
-        total_tokens_embeddings = config.vocab_size + config.n_special
+        total_tokens_embeddings = config.vocab_size + n_special
    You should use the associated indices to index the embeddings.
@@ -773,24 +686,21 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
        super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
        self.transformer = OpenAIGPTModel(config)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = SequenceSummary(config)
        self.apply(self.init_weights)
        self.tie_weights()
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
+    def tie_weights(self):
-        """ Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
+        """ Make sure we are sharing the input and output embeddings.
-
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        Args:
            num_special_tokens: Special tokens to be added to the embedding matrix
            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
                Defaults to True.
        TODO Lysandre filled Args
        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
+        input_embeddings = self.transformer.tokens_embed.weight
-        self.transformer.set_num_special_tokens(num_special_tokens)
+        if self.config.torchscript:
-        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
+            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
        else:
            self.lm_head.weight = input_embeddings  # Tied weights
    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                position_ids=None, head_mask=None):
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -287,6 +287,10 @@ class TransfoXLConfig(PretrainedConfig):
            raise ValueError("First argument must be either a vocabulary size (int)"
                             "or the path to a pretrained model config file (str)")
    @property
    def vocab_size(self):
        """ Read-only alias returning `n_token`. """
        return self.n_token
    @property
    def hidden_size(self):
        """ Read-only alias returning `d_model`. """
        return self.d_model
@@ -998,6 +1002,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        self.apply(self.init_weights)
    def _resize_token_embeddings(self, new_num_tokens):
        # Resizing the token embeddings is not implemented for this model: always raises.
        raise NotImplementedError
    def backward_compatible(self):
        self.sample_softmax = -1
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -151,6 +151,7 @@ class PreTrainedModel(nn.Module):
    pretrained_model_archive_map = {}
    load_tf_weights = lambda model, config, path: None
    base_model_prefix = ""
    input_embeddings = None
    def __init__(self, config, *inputs, **kwargs):
        super(PreTrainedModel, self).__init__()
@@ -164,12 +165,48 @@ class PreTrainedModel(nn.Module):
        # Save config in model
        self.config = config
    def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
        """ Build a resized version of a token embedding module.

        Args:
            old_embeddings: ``nn.Embedding`` module to resize.
            new_num_tokens: Number of rows wanted in the returned module.
                Increasing the size adds rows initialized by ``self.init_weights`` at the end.
                Reducing the size drops rows from the end.
                ``None`` (default) keeps the module as it is.

        Returns:
            ``old_embeddings`` itself when no resizing is needed, otherwise a new
            ``nn.Embedding`` on the same device and with the same dtype, whose first
            ``min(old, new)`` rows are copied from ``old_embeddings``.
        """
        if new_num_tokens is None:
            return old_embeddings

        old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
        if new_num_tokens == old_num_tokens:
            # Nothing to resize: keep the existing parameter (and its identity) untouched
            return old_embeddings

        # Build new embeddings
        new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
        # Module.to() modifies the module in place. Matching the dtype as well as the
        # device keeps half/double precision models homogeneous after resizing.
        new_embeddings.to(old_embeddings.weight.device, dtype=old_embeddings.weight.dtype)

        # initialize all new embeddings (in particular added tokens)
        self.init_weights(new_embeddings)

        # Copy word embeddings from the previous weights
        num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
        new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]

        return new_embeddings
    def resize_token_embeddings(self, new_num_tokens):
        """ Change the number of rows of the input token embedding matrix.

        Args:
            new_num_tokens: Target number of tokens in the embedding matrix.
                A larger value appends newly initialized vectors at the end.
                A smaller value removes vectors from the end.
                Nothing is done when it already equals ``config.vocab_size``.
        """
        if new_num_tokens != self.config.vocab_size:
            # The model stored under `base_model_prefix` does the resizing;
            # when there is no such attribute the model itself is the base model.
            core_model = getattr(self, self.base_model_prefix, self)
            core_model._resize_token_embeddings(new_num_tokens)

            # Record the new size on this model's config, then on the base model
            self.config.vocab_size = new_num_tokens
            core_model.vocab_size = new_num_tokens

            # The embedding module was replaced: tie the weights again when the model supports it
            if hasattr(self, 'tie_weights'):
                self.tie_weights()
    def prune_heads(self, heads_to_prune):
        """ Prunes heads of the base model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
-        model_to_prune = getattr(self, self.base_model_prefix, self)  # get the base model if needed
+        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
-        model_to_prune._prune_heads(heads_to_prune)
+        base_model._prune_heads(heads_to_prune)
    def save_pretrained(self, save_directory):
        """ Save a model with its configuration file to a directory, so that it
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -104,7 +104,6 @@ class XLMConfig(PretrainedConfig):
    def __init__(self,
                 vocab_size_or_config_json_file=30145,
                 n_special=0,
                 emb_dim=2048,
                 n_layers=12,
                 n_heads=16,
@@ -148,7 +147,6 @@ class XLMConfig(PretrainedConfig):
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.n_words = vocab_size_or_config_json_file
            self.n_special = n_special
            self.emb_dim = emb_dim
            self.n_layers = n_layers
            self.n_heads = n_heads
@@ -183,8 +181,8 @@ class XLMConfig(PretrainedConfig):
                             "or the path to a pretrained model config file (str)")
    @property
-    def total_tokens_embeddings(self):
+    def vocab_size(self):
-        return self.n_words + self.n_special
+        return self.n_words
    @property
    def hidden_size(self):
@@ -479,6 +477,9 @@ class XLMModel(XLMPreTrainedModel):
        self.apply(self.init_weights)
    def _resize_token_embeddings(self, new_num_tokens):
        # Replace the token embedding module with the ``new_num_tokens``-row module
        # returned by ``_get_resized_embeddings``.
        self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
@@ -718,8 +719,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
    """
    def __init__(self, config):
        super(XLMWithLMHeadModel, self).__init__(config)
        self.torchscript = config.torchscript
        self.transformer = XLMModel(config)
        self.pred_layer = XLMPredLayer(config)
@@ -729,7 +728,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
    def tie_weights(self):
        """ Make sure we are sharing the embeddings
        """
-        if self.torchscript:
+        if self.config.torchscript:
            self.pred_layer.proj.weight = nn.Parameter(self.transformer.embeddings.weight.clone())
        else:
            self.pred_layer.proj.weight = self.transformer.embeddings.weight
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -312,6 +312,10 @@ class XLNetConfig(PretrainedConfig):
            raise ValueError("First argument must be either a vocabulary size (int)"
                             "or the path to a pretrained model config file (str)")
    @property
    def vocab_size(self):
        """Vocabulary size; this config stores it as ``n_token``."""
        return self.n_token

    @vocab_size.setter
    def vocab_size(self, value):
        # ``PreTrainedModel.resize_token_embeddings`` assigns ``config.vocab_size`` after
        # resizing the embeddings; without a setter that assignment raises AttributeError
        # and leaves the model with resized embeddings but a stale config.
        self.n_token = value
    @property
    def hidden_size(self):
        """Hidden size; read-only alias of ``d_model``."""
        return self.d_model
@@ -654,6 +658,9 @@ class XLNetModel(XLNetPreTrainedModel):
        self.apply(self.init_weights)
    def _resize_token_embeddings(self, new_num_tokens):
        # Replace the word embedding module with the ``new_num_tokens``-row module
        # returned by ``_get_resized_embeddings``.
        self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
    def _prune_heads(self, heads_to_prune):
        logger.info("Head pruning is not implemented for XLNet")
        pass
@@ -970,20 +977,17 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
        super(XLNetLMHeadModel, self).__init__(config)
        self.attn_type = config.attn_type
        self.same_length = config.same_length
        self.torchscript = config.torchscript
        self.transformer = XLNetModel(config)
        self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
        # Tie weights
        self.apply(self.init_weights)
        self.tie_weights()
    def tie_weights(self):
        """ Make sure we are sharing the embeddings
        """
-        if self.torchscript:
+        if self.config.torchscript:
            self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone())
        else:
            self.lm_loss.weight = self.transformer.word_embedding.weight
--- a/pytorch_transformers/tests/modeling_bert_test.py
+++ b/pytorch_transformers/tests/modeling_bert_test.py
@@ -26,7 +26,7 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                     BertForTokenClassification, BertForMultipleChoice)
 from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
 class BertModelTest(unittest.TestCase):
--- a/pytorch_transformers/tests/modeling_tests_commons.py
+++ b/pytorch_transformers/tests/modeling_tests_commons.py
@@ -22,8 +22,15 @@ import shutil
 import json
 import random
 import unittest
 import logging
 import torch
 from pytorch_transformers import PretrainedConfig, PreTrainedModel
 from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 def _config_zero_init(config):
    configs_no_init = copy.deepcopy(config)
    for key in configs_no_init.__dict__.keys():
@@ -242,6 +249,7 @@ class ConfigTester(object):
    def create_and_test_config_common_properties(self):
        config = self.config_class(**self.inputs_dict)
        self.parent.assertTrue(hasattr(config, 'vocab_size'))
        self.parent.assertTrue(hasattr(config, 'hidden_size'))
        self.parent.assertTrue(hasattr(config, 'num_attention_heads'))
        self.parent.assertTrue(hasattr(config, 'num_hidden_layers'))
@@ -276,7 +284,6 @@ class GPTModelTester(object):
                    use_token_type_ids=True,
                    use_labels=True,
                    vocab_size=99,
                    n_special=1,
                    n_positions=33,
                    hidden_size=32,
                    num_hidden_layers=5,
@@ -299,7 +306,6 @@ class GPTModelTester(object):
        self.use_token_type_ids = use_token_type_ids
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.n_special = n_special
        self.n_positions = n_positions
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
@@ -316,7 +322,7 @@ class GPTModelTester(object):
        self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
    def prepare_config_and_inputs(self):
-        total_num_tokens = self.vocab_size + self.n_special
+        total_num_tokens = self.vocab_size
        input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
        position_ids = None
@@ -338,7 +344,6 @@ class GPTModelTester(object):
        config = self.config_class(
            vocab_size_or_config_json_file=self.vocab_size,
            n_special=self.n_special,
            n_positions=self.n_positions,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
@@ -370,7 +375,7 @@ class GPTModelTester(object):
        outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
        loss, lm_logits = outputs[:2]
-        total_voc = self.n_special + self.vocab_size
+        total_voc = self.vocab_size
        self.parent.assertListEqual(
            list(lm_logits.size()),
            [self.batch_size, self.n_choices, self.seq_length, total_voc])
@@ -400,7 +405,7 @@ class GPTModelTester(object):
        lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
        loss = [lm_loss, mc_loss]
-        total_voc = self.n_special + self.vocab_size
+        total_voc = self.vocab_size
        self.parent.assertListEqual(
            list(lm_logits.size()),
            [self.batch_size, self.n_choices, self.seq_length, total_voc])
@@ -441,6 +446,30 @@ class GPTModelTester(object):
        self.create_and_check_commons(*config_and_inputs)
    def run_slow_tests(self):
-        config_and_inputs = self.prepare_config_and_inputs()
+        self.create_and_check_model_from_pretrained()
        self.create_and_check_model_from_pretrained(*config_and_inputs)
 class ModelUtilsTest(unittest.TestCase):
    # Checks `from_pretrained` for config and model, using BertConfig / BertModel.
    def test_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        # Only the first entry of the BERT archive map is exercised
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            config = BertConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, PretrainedConfig)
            # Plain call first, then the variant that also returns loading info
            # (the second call rebinds `model`)
            model = BertModel.from_pretrained(model_name)
            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, PreTrainedModel)
            # Every entry reported in `loading_info` is expected to be empty
            # NOTE(review): presumably these are lists of missing/unexpected keys - confirm
            for value in loading_info.values():
                self.assertEqual(len(value), 0)
            # Extra keyword arguments given to `from_pretrained` must end up in the config
            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
            self.assertEqual(model.config.output_attentions, True)
            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(model.config, config)
 if __name__ == "__main__":
    unittest.main()
--- a/pytorch_transformers/tests/modeling_gpt2_test.py
+++ b/pytorch_transformers/tests/modeling_gpt2_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (GPT2Config, GPT2Model,
                                     GPT2LMHeadModel, GPT2DoubleHeadsModel)
-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
 class GPT2ModelTest(unittest.TestCase):
--- a/pytorch_transformers/tests/modeling_openai_test.py
+++ b/pytorch_transformers/tests/modeling_openai_test.py
@@ -24,7 +24,7 @@ import torch
 from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
                                     OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
 class OpenAIModelTest(unittest.TestCase):
--- a/pytorch_transformers/tests/modeling_transfo_xl_test.py
+++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
 from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
 class TransfoXLModelTest(unittest.TestCase):
    class TransfoXLModelTester(object):
--- a/pytorch_transformers/tests/modeling_utils_test.py
+++ b/pytorch_transformers/tests/modeling_utils_test.py
@@ -1,47 +0,0 @@
 # coding=utf-8
 # Copyright 2018 HuggingFace Inc..
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import unittest
 import logging
 from pytorch_transformers import PretrainedConfig, PreTrainedModel
 from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 class ModelUtilsTest(unittest.TestCase):
    def test_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            config = BertConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, PretrainedConfig)
            model = BertModel.from_pretrained(model_name)
            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, PreTrainedModel)
            for value in loading_info.values():
                self.assertEqual(len(value), 0)
            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
            self.assertEqual(model.config.output_attentions, True)
            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(model.config, config)
 if __name__ == "__main__":
    unittest.main()
--- a/pytorch_transformers/tests/modeling_xlm_test.py
+++ b/pytorch_transformers/tests/modeling_xlm_test.py
@@ -23,7 +23,7 @@ import pytest
 from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
 from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
 class XLMModelTest(unittest.TestCase):
--- a/pytorch_transformers/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
 from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
 class XLNetModelTest(unittest.TestCase):
    class XLNetModelTester(object):