From bd404735a7f282a41b11e240eb7c880e329567c3 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 12 Jul 2019 00:02:49 +0200
Subject: [PATCH] embeddings resizing + tie_weights

---
 pytorch_transformers/modeling_bert.py         |  53 +++++--
 pytorch_transformers/modeling_gpt2.py         | 148 +++--------------
 pytorch_transformers/modeling_openai.py       | 150 ++++--------------
 pytorch_transformers/modeling_transfo_xl.py   |   7 +
 pytorch_transformers/modeling_utils.py        |  41 ++++-
 pytorch_transformers/modeling_xlm.py          |  13 +-
 pytorch_transformers/modeling_xlnet.py        |  12 +-
 .../tests/modeling_bert_test.py               |   2 +-
 ...sts_commons.py => modeling_common_test.py} |  45 +++++-
 .../tests/modeling_gpt2_test.py               |   2 +-
 .../tests/modeling_openai_test.py             |   2 +-
 .../tests/modeling_transfo_xl_test.py         |   2 +-
 .../tests/modeling_utils_test.py              |  47 ------
 .../tests/modeling_xlm_test.py                |   2 +-
 .../tests/modeling_xlnet_test.py              |   2 +-
 15 files changed, 196 insertions(+), 332 deletions(-)
 rename pytorch_transformers/tests/{modeling_tests_commons.py => modeling_common_test.py} (91%)
 delete mode 100644 pytorch_transformers/tests/modeling_utils_test.py

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 6da6a5e507..d88c57bb79 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -507,23 +507,17 @@ class BertPredictionHeadTransform(nn.Module):
 
 
 class BertLMPredictionHead(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
         super(BertLMPredictionHead, self).__init__()
         self.transform = BertPredictionHeadTransform(config)
-        self.torchscript = config.torchscript
 
         # The output weights are the same as the input embeddings, but there is
         # an output-only bias for each token.
-        self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
-                                 bert_model_embedding_weights.size(0),
+        self.decoder = nn.Linear(config.hidden_size,
+                                 config.vocab_size,
                                  bias=False)
 
-        if self.torchscript:
-            self.decoder.weight = nn.Parameter(bert_model_embedding_weights.clone())
-        else:
-            self.decoder.weight = bert_model_embedding_weights
-
-        self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
 
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
@@ -532,9 +526,9 @@ class BertLMPredictionHead(nn.Module):
 
 
 class BertOnlyMLMHead(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
         super(BertOnlyMLMHead, self).__init__()
-        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+        self.predictions = BertLMPredictionHead(config)
 
     def forward(self, sequence_output):
         prediction_scores = self.predictions(sequence_output)
@@ -552,9 +546,9 @@ class BertOnlyNSPHead(nn.Module):
 
 
 class BertPreTrainingHeads(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
         super(BertPreTrainingHeads, self).__init__()
-        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+        self.predictions = BertLMPredictionHead(config)
         self.seq_relationship = nn.Linear(config.hidden_size, 2)
 
     def forward(self, sequence_output, pooled_output):
@@ -619,6 +613,11 @@ class BertModel(BertPreTrainedModel):
 
         self.apply(self.init_weights)
 
+    def _resize_token_embeddings(self, new_num_tokens):
+        old_embeddings = self.embeddings.word_embeddings
+        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
+        self.embeddings.word_embeddings = new_embeddings
+
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
@@ -750,9 +749,20 @@ class BertForPreTraining(BertPreTrainedModel):
         super(BertForPreTraining, self).__init__(config)
 
         self.bert = BertModel(config)
-        self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
+        self.cls = BertPreTrainingHeads(config)
 
         self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        input_embeddings = self.bert.embeddings.word_embeddings.weight
+        if self.config.torchscript:
+            self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
+        else:
+            self.cls.predictions.decoder.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                 next_sentence_label=None, head_mask=None):
@@ -845,9 +855,20 @@ class BertForMaskedLM(BertPreTrainedModel):
         super(BertForMaskedLM, self).__init__(config)
 
         self.bert = BertModel(config)
-        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
+        self.cls = BertOnlyMLMHead(config)
 
         self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        input_embeddings = self.bert.embeddings.word_embeddings.weight
+        if self.config.torchscript:
+            self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
+        else:
+            self.cls.predictions.decoder.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
         """
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 495e002529..06f933147f 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -104,7 +104,6 @@ class GPT2Config(PretrainedConfig):
 
     Args:
         vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
-        n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
         n_positions: Number of positional embeddings.
         n_ctx: Size of the causal mask (usually same as n_positions).
         n_embd: Dimensionality of the embeddings and hidden states.
@@ -119,14 +118,12 @@ class GPT2Config(PretrainedConfig):
         embd_pdrop: The dropout ratio for the embeddings.
         initializer_range: The sttdev of the truncated_normal_initializer for
             initializing all weight matrices.
-        predict_special_tokens: should we predict special tokens (when the model has a LM head)
     """
     pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(
         self,
         vocab_size_or_config_json_file=50257,
-        n_special=0,
         n_positions=1024,
         n_ctx=1024,
         n_embd=768,
@@ -137,7 +134,6 @@ class GPT2Config(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
-        predict_special_tokens=True,
 
         num_labels=1,
         summary_type='token_ids',
@@ -151,7 +147,6 @@ class GPT2Config(PretrainedConfig):
 
         Args:
             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
-            n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             n_embd: Dimensionality of the embeddings and hidden states.
@@ -166,7 +161,6 @@ class GPT2Config(PretrainedConfig):
             embd_pdrop: The dropout ratio for the embeddings.
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
-            predict_special_tokens: should we predict special tokens (when the model has a LM head)
         """
         super(GPT2Config, self).__init__(**kwargs)
 
@@ -178,7 +172,6 @@ class GPT2Config(PretrainedConfig):
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
             self.vocab_size = vocab_size_or_config_json_file
-            self.n_special = n_special
             self.n_ctx = n_ctx
             self.n_positions = n_positions
             self.n_embd = n_embd
@@ -189,7 +182,6 @@ class GPT2Config(PretrainedConfig):
             self.attn_pdrop = attn_pdrop
             self.layer_norm_epsilon = layer_norm_epsilon
             self.initializer_range = initializer_range
-            self.predict_special_tokens = predict_special_tokens
 
             self.num_labels = num_labels
             self.summary_type = summary_type
@@ -203,10 +195,6 @@ class GPT2Config(PretrainedConfig):
                 "or the path to a pretrained model config file (str)"
             )
 
-    @property
-    def total_tokens_embeddings(self):
-        return self.vocab_size + self.n_special
-
     @property
     def hidden_size(self):
         return self.n_embd
@@ -347,34 +335,6 @@ class Block(nn.Module):
         return outputs  # x, present, (attentions)
 
 
-class GPT2LMHead(nn.Module):
-    """ Language Model Head for the transformer """
-
-    def __init__(self, model_embeddings_weights, config):
-        super(GPT2LMHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.vocab_size = config.vocab_size
-        self.predict_special_tokens = config.predict_special_tokens
-        self.torchscript = config.torchscript
-        embed_shape = model_embeddings_weights.shape
-        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        self.set_embeddings_weights(model_embeddings_weights)
-
-    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
-        self.predict_special_tokens = predict_special_tokens
-        # Export to TorchScript can't handle parameter sharing so we are cloning them.
-        if self.torchscript:
-            self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
-        else:
-            self.decoder.weight = model_embeddings_weights  # Tied weights
-
-    def forward(self, hidden_state):
-        lm_logits = self.decoder(hidden_state)
-        if not self.predict_special_tokens:
-            lm_logits = lm_logits[..., :self.vocab_size]
-        return lm_logits
-
-
 class GPT2PreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
@@ -400,36 +360,6 @@ class GPT2PreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
 
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                    . `gpt2`
-                - a path or url to a pretrained model archive containing:
-                    . `gpt2_config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
-                - a path or url to a pretrained model archive containing:
-                    . `gpt2_config.json` a configuration file for the model
-                    . a TensorFlow checkpoint with trained weights
-            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-            *inputs, **kwargs: additional input for the specific GPT2 class
-        """
-        num_special_tokens = kwargs.pop('num_special_tokens', None)
-
-        model = super().from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-
-        # Add additional embeddings for special tokens if needed
-        # This step also make sure we are still sharing the output and input embeddings after loading weights
-        model.set_num_special_tokens(num_special_tokens)
-        return model
-
 
 class GPT2Model(GPT2PreTrainedModel):
     """OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners").
@@ -447,13 +377,13 @@ class GPT2Model(GPT2PreTrainedModel):
          config.vocab_size - 1,                                     ______________________
          config.vocab_size,
          ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
+         config.vocab_size + n_special - 1]                         ______________________
 
-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is equal to
+    where total_tokens_embeddings is equal to
 
     ::
 
-        total_tokens_embeddings = config.vocab_size + config.n_special
+        total_tokens_embeddings = config.vocab_size + n_special
 
     You should use the associated indices to index the embeddings.
 
@@ -474,7 +404,7 @@ class GPT2Model(GPT2PreTrainedModel):
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
 
-        self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
+        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
         self.wpe = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
@@ -482,26 +412,8 @@ class GPT2Model(GPT2PreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def set_num_special_tokens(self, num_special_tokens=None):
-        """
-        Update input embeddings with new embedding matrix if needed.
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-
-        TODO Lysandre filled args
-        """
-        if num_special_tokens is None or self.config.n_special == num_special_tokens:
-            return
-        # Update config
-        self.config.n_special = num_special_tokens
-        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
-        old_embed = self.wte
-        self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
-        self.wte.to(old_embed.weight.device)
-        self.init_weights(self.wte)
-        # Copy word embeddings from the previous weights
-        self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
+    def _resize_token_embeddings(self, new_num_tokens):
+        self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
 
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
@@ -641,23 +553,20 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     def __init__(self, config):
         super(GPT2LMHeadModel, self).__init__(config)
         self.transformer = GPT2Model(config)
-        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
         self.apply(self.init_weights)
+        self.tie_weights()
 
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-
-        TODO Lysandre filled args
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
+        input_embeddings = self.transformer.wte.weight
+        if self.config.torchscript:
+            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
+        else:
+            self.lm_head.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
         """
@@ -740,25 +649,22 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     def __init__(self, config):
         super(GPT2DoubleHeadsModel, self).__init__(config)
         self.transformer = GPT2Model(config)
-        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.multiple_choice_head = SequenceSummary(config)
 
         self.apply(self.init_weights)
+        # Share (or clone, for TorchScript) the input embeddings with the LM head, like the other LM-head models
+        self.tie_weights()
 
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        Update input and output embeddings with new embedding matrix.Make sure we are sharing the embeddings
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-
-        TODO Lysandre filled args
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
+        input_embeddings = self.transformer.wte.weight
+        if self.config.torchscript:
+            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
+        else:
+            self.lm_head.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, past=None, head_mask=None):
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index aa35b163f1..ebf1035d21 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -156,7 +156,6 @@ class OpenAIGPTConfig(PretrainedConfig):
     def __init__(
         self,
         vocab_size_or_config_json_file=40478,
-        n_special=0,
         n_positions=512,
         n_ctx=512,
         n_embd=768,
@@ -190,7 +189,6 @@ class OpenAIGPTConfig(PretrainedConfig):
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
             self.vocab_size = vocab_size_or_config_json_file
-            self.n_special = n_special
             self.n_ctx = n_ctx
             self.n_positions = n_positions
             self.n_embd = n_embd
@@ -216,10 +214,6 @@ class OpenAIGPTConfig(PretrainedConfig):
                 "or the path to a pretrained model config file (str)"
             )
 
-    @property
-    def total_tokens_embeddings(self):
-        return self.vocab_size + self.n_special
-
     @property
     def hidden_size(self):
         return self.n_embd
@@ -355,34 +349,6 @@ class Block(nn.Module):
         return outputs
 
 
-class OpenAIGPTLMHead(nn.Module):
-    """ Language Model Head for the transformer """
-
-    def __init__(self, model_embeddings_weights, config):
-        super(OpenAIGPTLMHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.vocab_size = config.vocab_size
-        self.predict_special_tokens = config.predict_special_tokens
-        self.torchscript = config.torchscript
-        embed_shape = model_embeddings_weights.shape
-        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        self.set_embeddings_weights(model_embeddings_weights)
-
-    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
-        self.predict_special_tokens = predict_special_tokens
-
-        if self.torchscript:
-            self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
-        else:
-            self.decoder.weight = model_embeddings_weights  # Tied weights
-
-    def forward(self, hidden_state):
-        lm_logits = self.decoder(hidden_state)
-        if not self.predict_special_tokens:
-            lm_logits = lm_logits[..., :self.vocab_size]
-        return lm_logits
-
-
 class OpenAIGPTPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
@@ -408,36 +374,6 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
 
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                - a path or url to a pretrained model archive containing:
-                    . `config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
-                - a path or url to a pretrained model archive containing:
-                    . `config.json` a configuration file for the model
-                    . a series of NumPy files containing OpenAI TensorFlow trained weights
-            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
-            *inputs, **kwargs: additional input for the specific OpenAI-GPT class
-        """
-        num_special_tokens = kwargs.get('num_special_tokens', None)
-        kwargs.pop('num_special_tokens', None)
-
-        model = super(PreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, pretrained_model_name_or_path, *inputs, **kwargs)
-
-        # Add additional embeddings for special tokens if needed
-        # This step also make sure we are still sharing the output and input embeddings after loading weights
-        model.set_num_special_tokens(num_special_tokens)
-        return model
-
 
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
@@ -457,13 +393,13 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
          config.vocab_size - 1,                                     ______________________
          config.vocab_size,
          ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
+         config.vocab_size + n_special - 1]                         ______________________
 
-    where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
+    where ``total_tokens_embeddings`` is:
 
     ::
 
-        total_tokens_embeddings = config.vocab_size + config.n_special
+        total_tokens_embeddings = config.vocab_size + n_special
 
     You should use the associated indices to index the embeddings.
 
@@ -485,34 +421,15 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
 
-        self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
+        self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)
         self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
 
         self.apply(self.init_weights)
 
-    def set_num_special_tokens(self, num_special_tokens=None):
-        """
-        Update input embeddings with new embedding matrice if needed
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-
-        TODO Lysandre filled Args
-
-        """
-        if num_special_tokens is None or self.config.n_special == num_special_tokens:
-            return
-        # Update config
-        self.config.n_special = num_special_tokens
-        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
-        old_embed = self.tokens_embed
-        self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
-        self.tokens_embed.to(old_embed.weight.device)
-        self.init_weights(self.tokens_embed)
-        # Copy word embeddings from the previous weights
-        self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
+    def _resize_token_embeddings(self, new_num_tokens):
+        self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
 
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
@@ -657,24 +574,20 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     def __init__(self, config):
         super(OpenAIGPTLMHeadModel, self).__init__(config)
         self.transformer = OpenAIGPTModel(config)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
         self.apply(self.init_weights)
+        self.tie_weights()
 
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-
-        TODO Lysandre filled Args
-
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
+        input_embeddings = self.transformer.tokens_embed.weight
+        if self.config.torchscript:
+            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
+        else:
+            self.lm_head.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
         """
@@ -747,13 +660,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
          config.vocab_size - 1,                                     ______________________
          config.vocab_size,
          ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
+         config.vocab_size + n_special - 1]                         ______________________
 
-    where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
+    where ``total_tokens_embeddings`` is:
 
     ::
 
-        total_tokens_embeddings = config.vocab_size + config.n_special
+        total_tokens_embeddings = config.vocab_size + n_special
 
     You should use the associate indices to index the embeddings.
 
@@ -773,24 +686,21 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
 
         self.transformer = OpenAIGPTModel(config)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.multiple_choice_head = SequenceSummary(config)
 
         self.apply(self.init_weights)
+        self.tie_weights()
 
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-
-        TODO Lysandre filled Args
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
+        input_embeddings = self.transformer.tokens_embed.weight
+        if self.config.torchscript:
+            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
+        else:
+            self.lm_head.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, head_mask=None):
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index b194f43b68..71f80a9eea 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -287,6 +287,10 @@ class TransfoXLConfig(PretrainedConfig):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
 
+    @property
+    def vocab_size(self):
+        return self.n_token
+
     @property
     def hidden_size(self):
         return self.d_model
@@ -998,6 +1002,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
         self.apply(self.init_weights)
 
+    def _resize_token_embeddings(self, new_num_tokens):
+        raise NotImplementedError("Resizing token embeddings is not implemented for TransfoXLModel")
+
     def backward_compatible(self):
         self.sample_softmax = -1
 
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index a9445ecad5..8fdfda4720 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -151,6 +151,7 @@ class PreTrainedModel(nn.Module):
     pretrained_model_archive_map = {}
     load_tf_weights = lambda model, config, path: None
     base_model_prefix = ""
+    input_embeddings = None
 
     def __init__(self, config, *inputs, **kwargs):
         super(PreTrainedModel, self).__init__()
@@ -164,12 +165,74 @@ class PreTrainedModel(nn.Module):
         # Save config in model
         self.config = config
 
+    def _get_resized_embeddings(self, old_embeddings, new_num_tokens):
+        """ Build a resized copy of a token embedding module.
+
+        Args:
+            old_embeddings: Embedding module to resize. It is only read, never modified.
+            new_num_tokens: Number of rows of the returned embedding matrix.
+
+        Returns:
+            A new ``nn.Embedding`` on the device and with the dtype of ``old_embeddings.weight``.
+            Its first ``min(old_num_tokens, new_num_tokens)`` rows are copied from ``old_embeddings``,
+            the remaining rows (if any) keep the values set by ``self.init_weights``.
+        """
+        old_weight = old_embeddings.weight
+        old_num_tokens, old_embedding_dim = old_weight.size()
+
+        # Keep the padding index of the old module, unless its row is truncated away
+        padding_idx = getattr(old_embeddings, 'padding_idx', None)
+        if padding_idx is not None and padding_idx >= new_num_tokens:
+            padding_idx = None
+
+        # Build new embeddings
+        new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim, padding_idx=padding_idx)
+        new_embeddings.to(old_weight.device)
+
+        # initialize all new embeddings (in particular added tokens)
+        self.init_weights(new_embeddings)
+
+        # Copy word embeddings from the previous weights
+        num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
+        new_embeddings.weight.data[:num_tokens_to_copy, :] = old_weight.data[:num_tokens_to_copy, :]
+
+        # Match the dtype of the old weights (e.g. a half precision model); no-op if it is already the same
+        new_embeddings.to(dtype=old_weight.dtype)
+
+        return new_embeddings
+
+    def resize_token_embeddings(self, new_num_tokens):
+        """ Resize input token embeddings matrix.
+
+        Delegates to the ``_resize_token_embeddings`` method of the base model, updates ``vocab_size``
+        in the config and on the base model, then calls ``tie_weights`` again if the model defines it.
+
+        Args:
+            new_num_tokens: New number of tokens in the embedding matrix.
+                Increasing the size will add newly initialized vectors at the end
+                Reducing the size will remove vectors from the end
+                Nothing is done if it is equal to ``config.vocab_size``.
+        """
+        if new_num_tokens == self.config.vocab_size:
+            return
+        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
+        base_model._resize_token_embeddings(new_num_tokens)
+
+        # Update base model and current model config
+        # (the config class must allow assigning ``vocab_size``)
+        self.config.vocab_size = new_num_tokens
+        base_model.vocab_size = new_num_tokens
+
+        # Tie weights again if needed
+        if hasattr(self, 'tie_weights'):
+            self.tie_weights()
+
     def prune_heads(self, heads_to_prune):
         """ Prunes heads of the base model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         """
-        model_to_prune = getattr(self, self.base_model_prefix, self)  # get the base model if needed
-        model_to_prune._prune_heads(heads_to_prune)
+        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
+        base_model._prune_heads(heads_to_prune)
 
     def save_pretrained(self, save_directory):
         """ Save a model with its configuration file to a directory, so that it
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 7567a0f24b..3d5b35fae6 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -104,7 +104,6 @@ class XLMConfig(PretrainedConfig):
 
     def __init__(self,
                  vocab_size_or_config_json_file=30145,
-                 n_special=0,
                  emb_dim=2048,
                  n_layers=12,
                  n_heads=16,
@@ -148,7 +147,6 @@ class XLMConfig(PretrainedConfig):
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
             self.n_words = vocab_size_or_config_json_file
-            self.n_special = n_special
             self.emb_dim = emb_dim
             self.n_layers = n_layers
             self.n_heads = n_heads
@@ -183,8 +181,13 @@ class XLMConfig(PretrainedConfig):
                              "or the path to a pretrained model config file (str)")
 
     @property
-    def total_tokens_embeddings(self):
-        return self.n_words + self.n_special
+    def vocab_size(self):
+        return self.n_words
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        # Assigned by PreTrainedModel.resize_token_embeddings; a getter-only property would raise AttributeError
+        self.n_words = value
 
     @property
     def hidden_size(self):
@@ -479,6 +477,9 @@ class XLMModel(XLMPreTrainedModel):
 
         self.apply(self.init_weights)
 
+    def _resize_token_embeddings(self, new_num_tokens):
+        self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
+
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
@@ -718,8 +719,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
     """
     def __init__(self, config):
         super(XLMWithLMHeadModel, self).__init__(config)
-        self.torchscript = config.torchscript
-
         self.transformer = XLMModel(config)
         self.pred_layer = XLMPredLayer(config)
 
@@ -729,7 +728,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
     def tie_weights(self):
         """ Make sure we are sharing the embeddings
         """
-        if self.torchscript:
+        if self.config.torchscript:
             self.pred_layer.proj.weight = nn.Parameter(self.transformer.embeddings.weight.clone())
         else:
             self.pred_layer.proj.weight = self.transformer.embeddings.weight
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index a5f95957c3..36c068e3a3 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -312,6 +312,15 @@ class XLNetConfig(PretrainedConfig):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
 
+    @property
+    def vocab_size(self):
+        return self.n_token
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        # Assigned by PreTrainedModel.resize_token_embeddings; a getter-only property would raise AttributeError
+        self.n_token = value
+
     @property
     def hidden_size(self):
         return self.d_model
@@ -654,6 +658,9 @@ class XLNetModel(XLNetPreTrainedModel):
 
         self.apply(self.init_weights)
 
+    def _resize_token_embeddings(self, new_num_tokens):
+        self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
+
     def _prune_heads(self, heads_to_prune):
         logger.info("Head pruning is not implemented for XLNet")
         pass
@@ -970,20 +977,17 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         super(XLNetLMHeadModel, self).__init__(config)
         self.attn_type = config.attn_type
         self.same_length = config.same_length
-        self.torchscript = config.torchscript
 
         self.transformer = XLNetModel(config)
         self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
 
-        # Tie weights
-
         self.apply(self.init_weights)
         self.tie_weights()
 
     def tie_weights(self):
         """ Make sure we are sharing the embeddings
         """
-        if self.torchscript:
+        if self.config.torchscript:
             self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone())
         else:
             self.lm_loss.weight = self.transformer.word_embedding.weight
diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py
index fbdce29366..4ab0c9d157 100644
--- a/pytorch_transformers/tests/modeling_bert_test.py
+++ b/pytorch_transformers/tests/modeling_bert_test.py
@@ -26,7 +26,7 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForTokenClassification, BertForMultipleChoice)
 from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
 
 
 class BertModelTest(unittest.TestCase):
diff --git a/pytorch_transformers/tests/modeling_tests_commons.py b/pytorch_transformers/tests/modeling_common_test.py
similarity index 91%
rename from pytorch_transformers/tests/modeling_tests_commons.py
rename to pytorch_transformers/tests/modeling_common_test.py
index 5535177aaa..98849216fa 100644
--- a/pytorch_transformers/tests/modeling_tests_commons.py
+++ b/pytorch_transformers/tests/modeling_common_test.py
@@ -22,8 +22,15 @@ import shutil
 import json
 import random
 
+import unittest
+import logging
+
 import torch
 
+from pytorch_transformers import PretrainedConfig, PreTrainedModel
+from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
     for key in configs_no_init.__dict__.keys():
@@ -242,6 +249,7 @@ class ConfigTester(object):
 
     def create_and_test_config_common_properties(self):
         config = self.config_class(**self.inputs_dict)
+        self.parent.assertTrue(hasattr(config, 'vocab_size'))
         self.parent.assertTrue(hasattr(config, 'hidden_size'))
         self.parent.assertTrue(hasattr(config, 'num_attention_heads'))
         self.parent.assertTrue(hasattr(config, 'num_hidden_layers'))
@@ -276,7 +284,6 @@ class GPTModelTester(object):
                     use_token_type_ids=True,
                     use_labels=True,
                     vocab_size=99,
-                    n_special=1,
                     n_positions=33,
                     hidden_size=32,
                     num_hidden_layers=5,
@@ -299,7 +306,6 @@ class GPTModelTester(object):
         self.use_token_type_ids = use_token_type_ids
         self.use_labels = use_labels
         self.vocab_size = vocab_size
-        self.n_special = n_special
         self.n_positions = n_positions
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
@@ -316,7 +322,7 @@ class GPTModelTester(object):
         self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
 
     def prepare_config_and_inputs(self):
-        total_num_tokens = self.vocab_size + self.n_special
+        total_num_tokens = self.vocab_size
         input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
 
         position_ids = None
@@ -338,7 +344,6 @@ class GPTModelTester(object):
 
         config = self.config_class(
             vocab_size_or_config_json_file=self.vocab_size,
-            n_special=self.n_special,
             n_positions=self.n_positions,
             n_embd=self.hidden_size,
             n_layer=self.num_hidden_layers,
@@ -370,7 +375,7 @@ class GPTModelTester(object):
         outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
         loss, lm_logits = outputs[:2]
 
-        total_voc = self.n_special + self.vocab_size
+        total_voc = self.vocab_size
         self.parent.assertListEqual(
             list(lm_logits.size()),
             [self.batch_size, self.n_choices, self.seq_length, total_voc])
@@ -400,7 +405,7 @@ class GPTModelTester(object):
         lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
         loss = [lm_loss, mc_loss]
 
-        total_voc = self.n_special + self.vocab_size
+        total_voc = self.vocab_size
         self.parent.assertListEqual(
             list(lm_logits.size()),
             [self.batch_size, self.n_choices, self.seq_length, total_voc])
@@ -441,6 +446,30 @@ class GPTModelTester(object):
         self.create_and_check_commons(*config_and_inputs)
 
     def run_slow_tests(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        self.create_and_check_model_from_pretrained(*config_and_inputs)
+        self.create_and_check_model_from_pretrained()
 
+
+class ModelUtilsTest(unittest.TestCase):
+    def test_model_from_pretrained(self):
+        logging.basicConfig(level=logging.INFO)
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = BertConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, PretrainedConfig)
+
+            model = BertModel.from_pretrained(model_name)
+            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, PreTrainedModel)
+            for value in loading_info.values():
+                self.assertEqual(len(value), 0)
+
+            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+            self.assertEqual(model.config.output_attentions, True)
+            self.assertEqual(model.config.output_hidden_states, True)
+            self.assertEqual(model.config, config)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py
index 7400c9f64d..00a9cb4614 100644
--- a/pytorch_transformers/tests/modeling_gpt2_test.py
+++ b/pytorch_transformers/tests/modeling_gpt2_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (GPT2Config, GPT2Model,
                                      GPT2LMHeadModel, GPT2DoubleHeadsModel)
 
-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
 
 class GPT2ModelTest(unittest.TestCase):
 
diff --git a/pytorch_transformers/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py
index 27263ecb24..4f57f4661b 100644
--- a/pytorch_transformers/tests/modeling_openai_test.py
+++ b/pytorch_transformers/tests/modeling_openai_test.py
@@ -24,7 +24,7 @@ import torch
 from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
                                      OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
 
-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
 
 class OpenAIModelTest(unittest.TestCase):
 
diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py
index 49ba1addf1..9631cd6034 100644
--- a/pytorch_transformers/tests/modeling_transfo_xl_test.py
+++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
 from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
 
 class TransfoXLModelTest(unittest.TestCase):
     class TransfoXLModelTester(object):
diff --git a/pytorch_transformers/tests/modeling_utils_test.py b/pytorch_transformers/tests/modeling_utils_test.py
deleted file mode 100644
index 4944f41228..0000000000
--- a/pytorch_transformers/tests/modeling_utils_test.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# coding=utf-8
-# Copyright 2018 HuggingFace Inc..
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import unittest
-import logging
-
-from pytorch_transformers import PretrainedConfig, PreTrainedModel
-from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-class ModelUtilsTest(unittest.TestCase):
-    def test_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            config = BertConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, PretrainedConfig)
-
-            model = BertModel.from_pretrained(model_name)
-            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, PreTrainedModel)
-            for value in loading_info.values():
-                self.assertEqual(len(value), 0)
-
-            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-            self.assertEqual(model.config.output_attentions, True)
-            self.assertEqual(model.config.output_hidden_states, True)
-            self.assertEqual(model.config, config)
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py
index 6e2e082d19..9d6bc4054d 100644
--- a/pytorch_transformers/tests/modeling_xlm_test.py
+++ b/pytorch_transformers/tests/modeling_xlm_test.py
@@ -23,7 +23,7 @@ import pytest
 from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
 from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
 
 
 class XLMModelTest(unittest.TestCase):
diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py
index e167e2d2e8..41c114ce9c 100644
--- a/pytorch_transformers/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
 from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
 
 class XLNetModelTest(unittest.TestCase):
     class XLNetModelTester(object):