updating tests

2019-07-12 10:57:58 +02:00
parent 3fbceed8d2
commit 2918b7d2a0
14 changed files with 672 additions and 596 deletions
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -617,6 +617,7 @@ class BertModel(BertPreTrainedModel):
        old_embeddings = self.embeddings.word_embeddings
        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
        self.embeddings.word_embeddings = new_embeddings
        return self.embeddings.word_embeddings
    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
@@ -758,11 +759,8 @@ class BertForPreTraining(BertPreTrainedModel):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        input_embeddings = self.bert.embeddings.word_embeddings.weight
+        self._tie_or_clone_weights(self.cls.predictions.decoder,
-        if self.config.torchscript:
+                                   self.bert.embeddings.word_embeddings)
            self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
        else:
            self.cls.predictions.decoder.weight = input_embeddings  # Tied weights
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                next_sentence_label=None, head_mask=None):
@@ -864,11 +862,8 @@ class BertForMaskedLM(BertPreTrainedModel):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        input_embeddings = self.bert.embeddings.word_embeddings.weight
+        self._tie_or_clone_weights(self.cls.predictions.decoder,
-        if self.config.torchscript:
+                                   self.bert.embeddings.word_embeddings)
            self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
        else:
            self.cls.predictions.decoder.weight = input_embeddings  # Tied weights
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
        """
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -414,6 +414,7 @@ class GPT2Model(GPT2PreTrainedModel):
    def _resize_token_embeddings(self, new_num_tokens):
        self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
        return self.wte
    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
@@ -562,11 +563,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        input_embeddings = self.transformer.wte.weight
+        self._tie_or_clone_weights(self.lm_head,
-        if self.config.torchscript:
+                                   self.transformer.wte)
            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
        else:
            self.lm_head.weight = input_embeddings  # Tied weights
    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
        """
@@ -658,11 +656,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        input_embeddings = self.transformer.wte.weight
+        self._tie_or_clone_weights(self.lm_head,
-        if self.config.torchscript:
+                                   self.transformer.wte)
            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
        else:
            self.lm_head.weight = input_embeddings  # Tied weights
    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                position_ids=None, past=None, head_mask=None):
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -430,6 +430,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
    def _resize_token_embeddings(self, new_num_tokens):
        self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
        return self.tokens_embed
    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
@@ -583,11 +584,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        input_embeddings = self.transformer.tokens_embed.weight
+        self._tie_or_clone_weights(self.lm_head,
-        if self.config.torchscript:
+                                   self.transformer.tokens_embed)
            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
        else:
            self.lm_head.weight = input_embeddings  # Tied weights
    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
        """
@@ -696,11 +694,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        input_embeddings = self.transformer.tokens_embed.weight
+        self._tie_or_clone_weights(self.lm_head,
-        if self.config.torchscript:
+                                   self.transformer.tokens_embed)
            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
        else:
            self.lm_head.weight = input_embeddings  # Tied weights
    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                position_ids=None, head_mask=None):
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -291,6 +291,10 @@ class TransfoXLConfig(PretrainedConfig):
    def vocab_size(self):
        return self.n_token
    @vocab_size.setter
    def vocab_size(self, value):
        self.n_token = value
    @property
    def hidden_size(self):
        return self.d_model
@@ -1003,7 +1007,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        self.apply(self.init_weights)
    def _resize_token_embeddings(self, new_num_tokens):
-        raise NotImplementedError
+        return self.word_emb
    def backward_compatible(self):
        self.sample_softmax = -1
@@ -1280,12 +1284,19 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
        else:
            if self.config.tie_weight:
                for i in range(len(self.crit.out_layers)):
-                    self.crit.out_layers[i].weight = self.transformer.word_emb.emb_layers[i].weight
+                    self._tie_or_clone_weights(self.crit.out_layers[i],
                                               self.transformer.word_emb.emb_layers[i])
            if self.config.tie_projs:
                for i, tie_proj in enumerate(self.config.tie_projs):
                    if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
                        if self.config.torchscript:
                            self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone())
                        else:
                            self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
                    elif tie_proj and self.config.div_val != 1:
                        if self.config.torchscript:
                            self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone())
                        else:
                            self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
    def reset_length(self, tgt_len, ext_len, mem_len):
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -165,9 +165,27 @@ class PreTrainedModel(nn.Module):
        # Save config in model
        self.config = config
-    def _get_resized_embeddings(self, old_embeddings, new_num_tokens):
+    def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
-        # Build new embeddings
+        """ Build a resized Embedding Module from a provided token Embedding Module.
            Increasing the size will add newly initialized vectors at the end
            Reducing the size will remove vectors from the end
        Args:
            new_num_tokens: (Optional) New number of tokens in the embedding matrix.
                Increasing the size will add newly initialized vectors at the end
                Reducing the size will remove vectors from the end
                If not provided or None: return the provided token Embedding Module.
        Return:
            Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
        """
        if new_num_tokens is None:
            return old_embeddings
        old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
        if old_num_tokens == new_num_tokens:
            return old_embeddings
        # Build new embeddings
        new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
        new_embeddings.to(old_embeddings.weight.device)
@@ -180,18 +198,29 @@ class PreTrainedModel(nn.Module):
        return new_embeddings
-    def resize_token_embeddings(self, new_num_tokens):
+    def _tie_or_clone_weights(self, first_module, second_module):
-        """ Resize input token embeddings matrix.
+        """ Tie or clone module weights depending on whether we are using TorchScript or not
        """
        if self.config.torchscript:
            first_module.weight = nn.Parameter(second_module.weight.clone())
        else:
            first_module.weight = second_module.weight
    def resize_token_embeddings(self, new_num_tokens=None):
        """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
        Args:
-            new_num_tokens: New number of tokens in the embedding matrix.
+            new_num_tokens: (Optional) New number of tokens in the embedding matrix.
                Increasing the size will add newly initialized vectors at the end
                Reducing the size will remove vectors from the end
                If not provided or None: does nothing.
        Return:
            Pointer to the input tokens Embedding Module of the model
        """
        if new_num_tokens == self.config.vocab_size:
            return
        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
-        base_model._resize_token_embeddings(new_num_tokens)
+        model_embeds = base_model._resize_token_embeddings(new_num_tokens)
        if new_num_tokens is None:
            return model_embeds
        # Update base model and current model config
        self.config.vocab_size = new_num_tokens
@@ -201,6 +230,8 @@ class PreTrainedModel(nn.Module):
        if hasattr(self, 'tie_weights'):
            self.tie_weights()
        return model_embeds
    def prune_heads(self, heads_to_prune):
        """ Prunes heads of the base model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -184,6 +184,10 @@ class XLMConfig(PretrainedConfig):
    def vocab_size(self):
        return self.n_words
    @vocab_size.setter
    def vocab_size(self, value):
        self.n_words = value
    @property
    def hidden_size(self):
        return self.emb_dim
@@ -479,6 +483,7 @@ class XLMModel(XLMPreTrainedModel):
    def _resize_token_embeddings(self, new_num_tokens):
        self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
        return self.embeddings
    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
@@ -728,10 +733,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
    def tie_weights(self):
        """ Make sure we are sharing the embeddings
        """
-        if self.config.torchscript:
+        self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)
            self.pred_layer.proj.weight = nn.Parameter(self.transformer.embeddings.weight.clone())
        else:
            self.pred_layer.proj.weight = self.transformer.embeddings.weight
    def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None,
                attention_mask=None, cache=None, labels=None, head_mask=None):
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -316,6 +316,10 @@ class XLNetConfig(PretrainedConfig):
    def vocab_size(self):
        return self.n_token
    @vocab_size.setter
    def vocab_size(self, value):
        self.n_token = value
    @property
    def hidden_size(self):
        return self.d_model
@@ -660,10 +664,10 @@ class XLNetModel(XLNetPreTrainedModel):
    def _resize_token_embeddings(self, new_num_tokens):
        self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
        return self.word_embedding
    def _prune_heads(self, heads_to_prune):
-        logger.info("Head pruning is not implemented for XLNet")
+        raise NotImplementedError
        pass
    def create_mask(self, qlen, mlen):
        """
@@ -987,10 +991,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
    def tie_weights(self):
        """ Make sure we are sharing the embeddings
        """
-        if self.config.torchscript:
+        self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)
            self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone())
        else:
            self.lm_loss.weight = self.transformer.word_embedding.weight
    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
--- a/pytorch_transformers/tests/modeling_bert_test.py
+++ b/pytorch_transformers/tests/modeling_bert_test.py
@@ -26,10 +26,15 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                     BertForTokenClassification, BertForMultipleChoice)
 from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
-class BertModelTest(unittest.TestCase):
+class BertModelTest(CommonTestCases.CommonModelTester):
    all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
            BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
            BertForTokenClassification)
    class BertModelTester(object):
        def __init__(self,
@@ -55,9 +60,6 @@ class BertModelTest(unittest.TestCase):
                     num_labels=3,
                     num_choices=4,
                     scope=None,
                     all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
                             BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
                             BertForTokenClassification),
                    ):
            self.parent = parent
            self.batch_size = batch_size
@@ -81,7 +83,6 @@ class BertModelTest(unittest.TestCase):
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
            self.all_model_classes = all_model_classes
        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -253,16 +254,51 @@ class BertModelTest(unittest.TestCase):
            self.check_loss_output(result)
-        def create_and_check_bert_commons(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, input_mask,
             sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
-            create_and_check_commons(self, config, inputs_dict)
+            return config, inputs_dict
-    def test_default(self):
+    def setUp(self):
-        self.run_tester(BertModelTest.BertModelTester(self))
+        self.model_tester = BertModelTest.BertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)
    def test_config(self):
-        config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)
+        self.config_tester.run_common_tests()
-        config_tester.run_common_tests()
+
    def test_bert_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_bert_model(*config_and_inputs)
    def test_for_masked_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
    def test_for_multiple_choice(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
    def test_for_next_sequence_prediction(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs)
    def test_for_pretraining(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_bert_for_pretraining(*config_and_inputs)
    def test_for_question_answering(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_bert_for_question_answering(*config_and_inputs)
    def test_for_sequence_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_bert_for_sequence_classification(*config_and_inputs)
    def test_for_token_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
    @pytest.mark.slow
    def test_model_from_pretrained(self):
@@ -272,33 +308,5 @@ class BertModelTest(unittest.TestCase):
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
    def run_tester(self, tester):
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_model(*config_and_inputs)
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs)
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_for_pretraining(*config_and_inputs)
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_for_question_answering(*config_and_inputs)
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_for_sequence_classification(*config_and_inputs)
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_for_token_classification(*config_and_inputs)
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_commons(*config_and_inputs)
 if __name__ == "__main__":
    unittest.main()
--- a/pytorch_transformers/tests/modeling_common_test.py
+++ b/pytorch_transformers/tests/modeling_common_test.py
@@ -39,18 +39,89 @@ def _config_zero_init(config):
            setattr(configs_no_init, key, 0.0)
    return configs_no_init
-def _create_and_check_torchscript_output_attentions(tester, model_classes, config, inputs_dict):
+class CommonTestCases:
    class CommonModelTester(unittest.TestCase):
        model_tester = None
        all_model_classes = ()
        test_torchscript = True
        test_pruning = True
        test_resize_embeddings = True
        def test_initialization(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            configs_no_init = _config_zero_init(config)
            for model_class in self.all_model_classes:
                model = model_class(config=configs_no_init)
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        self.assertIn(param.data.mean().item(), [0.0, 1.0],
                        msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
        def test_attention_outputs(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            for model_class in self.all_model_classes:
                config.output_attentions = True
-    _create_and_check_torchscript(tester, model_classes, config, inputs_dict)
+                config.output_hidden_states = False
                model = model_class(config)
                model.eval()
                outputs = model(**inputs_dict)
                attentions = outputs[-1]
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, False)
                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads,
                    self.model_tester.seq_length,
                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
                out_len = len(outputs)
-def _create_and_check_torchscript_output_hidden_state(tester, model_classes, config, inputs_dict):
+                # Check attention is always last and order is fine
                config.output_attentions = True
                config.output_hidden_states = True
-    _create_and_check_torchscript(tester, model_classes, config, inputs_dict)
+                model = model_class(config)
                model.eval()
                outputs = model(**inputs_dict)
                self.assertEqual(out_len+1, len(outputs))
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, True)
                attentions = outputs[-1]
                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads,
                    self.model_tester.seq_length,
                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
        def test_torchscript(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            self._create_and_check_torchscript(config, inputs_dict)
        def test_torchscript_output_attentions(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            config.output_attentions = True
            self._create_and_check_torchscript(config, inputs_dict)
        def test_torchscript_output_hidden_state(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            config.output_hidden_states = True
            self._create_and_check_torchscript(config, inputs_dict)
        def _create_and_check_torchscript(self, config, inputs_dict):
            if not self.test_torchscript:
                return
 def _create_and_check_torchscript(tester, model_classes, config, inputs_dict):
            configs_no_init = _config_zero_init(config)  # To be sure we have no NaN
            configs_no_init.torchscript = True
-    for model_class in model_classes:
+            for model_class in self.all_model_classes:
                model = model_class(config=configs_no_init)
                model.eval()
                inputs = inputs_dict['input_ids']  # Let's keep only input_ids
@@ -58,19 +129,19 @@ def _create_and_check_torchscript(tester, model_classes, config, inputs_dict):
                try:
                    torch.jit.trace(model, inputs)
                except RuntimeError:
-            tester.parent.fail("Couldn't trace module.")
+                    self.fail("Couldn't trace module.")
                try:
                    traced_gpt2 = torch.jit.trace(model, inputs)
                    torch.jit.save(traced_gpt2, "traced_model.pt")
                except RuntimeError:
-            tester.parent.fail("Couldn't save module.")
+                    self.fail("Couldn't save module.")
                try:
                    loaded_model = torch.jit.load("traced_model.pt")
                    os.remove("traced_model.pt")
                except ValueError:
-            tester.parent.fail("Couldn't load module.")
+                    self.fail("Couldn't load module.")
                model.eval()
                loaded_model.eval()
@@ -83,28 +154,22 @@ def _create_and_check_torchscript(tester, model_classes, config, inputs_dict):
                    if p1.data.ne(p2.data).sum() > 0:
                        models_equal = False
-        tester.parent.assertTrue(models_equal)
+                self.assertTrue(models_equal)
 def _create_and_check_initialization(tester, model_classes, config, inputs_dict):
    configs_no_init = _config_zero_init(config)
    for model_class in model_classes:
        model = model_class(config=configs_no_init)
        for name, param in model.named_parameters():
            if param.requires_grad:
                tester.parent.assertIn(param.data.mean().item(), [0.0, 1.0],
                                       msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
-def _create_and_check_for_headmasking(tester, model_classes, config, inputs_dict):
+        def test_headmasking(self):
-    configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-    for model_class in model_classes:
+
            config.output_attentions = True
            config.output_hidden_states = True
            configs_no_init = _config_zero_init(config)  # To be sure we have no NaN
            for model_class in self.all_model_classes:
                model = model_class(config=configs_no_init)
                model.eval()
                # Prepare head_mask
                # Set requires_grad after having prepared the tensor to avoid the error "leaf variable has been moved into the graph interior"
-        head_mask = torch.ones(tester.num_hidden_layers, tester.num_attention_heads)
+                head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
                head_mask[0, 0] = 0
                head_mask[-1, :-1] = 0
                head_mask.requires_grad_(requires_grad=True)
@@ -124,158 +189,150 @@ def _create_and_check_for_headmasking(tester, model_classes, config, inputs_dict
                # Remove Nan
-        tester.parent.assertIsNotNone(multihead_outputs)
+                self.assertIsNotNone(multihead_outputs)
-        tester.parent.assertEqual(len(multihead_outputs), tester.num_hidden_layers)
+                self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
-        tester.parent.assertAlmostEqual(
+                self.assertAlmostEqual(
                    attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
-        tester.parent.assertNotEqual(
+                self.assertNotEqual(
                    attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
-        tester.parent.assertNotEqual(
+                self.assertNotEqual(
                    attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
-        tester.parent.assertAlmostEqual(
+                self.assertAlmostEqual(
                    attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
-        tester.parent.assertNotEqual(
+                self.assertNotEqual(
                    attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
-def _create_and_check_for_head_pruning(tester, model_classes, config, inputs_dict):
+        def test_head_pruning(self):
-    for model_class in model_classes:
+            if not self.test_pruning:
                return
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            for model_class in self.all_model_classes:
                config.output_attentions = True
                config.output_hidden_states = False
                model = model_class(config=config)
                model.eval()
-        heads_to_prune = {0: list(range(1, tester.num_attention_heads)),
+                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                -1: [0]}
                model.prune_heads(heads_to_prune)
                outputs = model(**inputs_dict)
                attentions = outputs[-1]
-        tester.parent.assertEqual(
+                self.assertEqual(
                    attentions[0].shape[-3], 1)
-        tester.parent.assertEqual(
+                self.assertEqual(
-            attentions[1].shape[-3], tester.num_attention_heads)
+                    attentions[1].shape[-3], self.model_tester.num_attention_heads)
-        tester.parent.assertEqual(
+                self.assertEqual(
-            attentions[-1].shape[-3], tester.num_attention_heads - 1)
+                    attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
-def _create_and_check_for_attentions(tester, model_classes, config, inputs_dict):
+        def test_hidden_states_output(self):
-    for model_class in model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.output_attentions = True
        config.output_hidden_states = False
        model = model_class(config)
        model.eval()
        outputs = model(**inputs_dict)
        attentions = outputs[-1]
        tester.parent.assertEqual(model.config.output_attentions, True)
        tester.parent.assertEqual(model.config.output_hidden_states, False)
        tester.parent.assertEqual(len(attentions), tester.num_hidden_layers)
        tester.parent.assertListEqual(
            list(attentions[0].shape[-3:]),
            [tester.num_attention_heads,
             tester.seq_length,
             tester.key_len if hasattr(tester, 'key_len') else tester.seq_length])
        out_len = len(outputs)
-        # Check attention is always last and order is fine
+            for model_class in self.all_model_classes:
        config.output_attentions = True
        config.output_hidden_states = True
        model = model_class(config)
        model.eval()
        outputs = model(**inputs_dict)
        tester.parent.assertEqual(out_len+1, len(outputs))
        tester.parent.assertEqual(model.config.output_attentions, True)
        tester.parent.assertEqual(model.config.output_hidden_states, True)
        attentions = outputs[-1]
        tester.parent.assertEqual(len(attentions), tester.num_hidden_layers)
        tester.parent.assertListEqual(
            list(attentions[0].shape[-3:]),
            [tester.num_attention_heads,
             tester.seq_length,
             tester.key_len if hasattr(tester, 'key_len') else tester.seq_length])
 def _create_and_check_for_hidden_states(tester, model_classes, config, inputs_dict):
    for model_class in model_classes:
                config.output_hidden_states = True
                config.output_attentions = False
                model = model_class(config)
                model.eval()
                outputs = model(**inputs_dict)
                hidden_states = outputs[-1]
-        tester.parent.assertEqual(model.config.output_attentions, False)
+                self.assertEqual(model.config.output_attentions, False)
-        tester.parent.assertEqual(model.config.output_hidden_states, True)
+                self.assertEqual(model.config.output_hidden_states, True)
-        tester.parent.assertEqual(len(hidden_states), tester.num_hidden_layers + 1)
+                self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
-        tester.parent.assertListEqual(
+                self.assertListEqual(
                    list(hidden_states[0].shape[-2:]),
-            [tester.seq_length, tester.hidden_size])
+                    [self.model_tester.seq_length, self.model_tester.hidden_size])
        def test_resize_tokens_embeddings(self):
            original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            if not self.test_resize_embeddings:
                return
            for model_class in self.all_model_classes:
                config = copy.deepcopy(original_config)
                model = model_class(config)
                model_vocab_size = config.vocab_size
                # Retrieve the embeddings and clone theme
                model_embed = model.resize_token_embeddings(model_vocab_size)
                cloned_embeddings = model_embed.weight.clone()
                # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
                model_embed = model.resize_token_embeddings(model_vocab_size + 10)
                self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
                # Check that it actually resizes the embeddings matrix
                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
                # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
                model_embed = model.resize_token_embeddings(model_vocab_size - 15)
                self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
                # Check that it actually resizes the embeddings matrix
                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
                # Check that adding and removing tokens has not modified the first part of the embedding matrix.
                models_equal = True
                for p1, p2 in zip(cloned_embeddings, model_embed.weight):
                    if p1.data.ne(p2.data).sum() > 0:
                        models_equal = False
                self.assertTrue(models_equal)
        def test_tie_model_weights(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            def check_same_values(layer_1, layer_2):
                equal = True
                for p1, p2 in zip(layer_1.weight, layer_2.weight):
                    if p1.data.ne(p2.data).sum() > 0:
                        equal = False
                return equal
            for model_class in self.all_model_classes:
                if not hasattr(model_class, 'tie_weights'):
                    continue
                config.torchscript = True
                model_not_tied = model_class(config)
                params_not_tied = list(model_not_tied.parameters())
                config_tied = copy.deepcopy(config)
                config_tied.torchscript = False
                model_tied = model_class(config_tied)
                params_tied = list(model_tied.parameters())
                # Check that the embedding layer and decoding layer are the same in size and in value
                self.assertGreater(len(params_not_tied), len(params_tied))
                # self.assertTrue(check_same_values(embeddings, decoding))
                # # Check that after modification, they remain the same.
                # embeddings.weight.data.div_(2)
                # # Check that the embedding layer and decoding layer are the same in size and in value
                # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
                # self.assertTrue(check_same_values(embeddings, decoding))
                # # Check that after modification, they remain the same.
                # decoding.weight.data.div_(4)
                # # Check that the embedding layer and decoding layer are the same in size and in value
                # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
                # self.assertTrue(check_same_values(embeddings, decoding))
                # Check that after resize they remain tied.
                model_tied.resize_token_embeddings(config.vocab_size + 10)
                params_tied_2 = list(model_tied.parameters())
                self.assertGreater(len(params_not_tied), len(params_tied))
                self.assertEqual(len(params_tied_2), len(params_tied))
                # decoding.weight.data.mul_(20)
                # # Check that the embedding layer and decoding layer are the same in size and in value
                # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
                # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
-def create_and_check_commons(tester, config, inputs_dict, test_pruning=True, test_torchscript=True):
+    class GPTModelTester(CommonModelTester):
    _create_and_check_initialization(tester, tester.all_model_classes, config, inputs_dict)
    _create_and_check_for_attentions(tester, tester.all_model_classes, config, inputs_dict)
    _create_and_check_for_headmasking(tester, tester.all_model_classes, config, inputs_dict)
    _create_and_check_for_hidden_states(tester, tester.all_model_classes, config, inputs_dict)
    if test_torchscript:
        _create_and_check_torchscript(tester, tester.all_model_classes, config, inputs_dict)
        _create_and_check_torchscript_output_attentions(tester, tester.all_model_classes, config, inputs_dict)
        _create_and_check_torchscript_output_hidden_state(tester, tester.all_model_classes, config, inputs_dict)
    if test_pruning:
        _create_and_check_for_head_pruning(tester, tester.all_model_classes, config, inputs_dict)
 def ids_tensor(shape, vocab_size, rng=None, name=None):
    """Create a random ``torch.long`` tensor of ``shape`` with values in ``[0, vocab_size)``.

    Args:
        shape: iterable of ints giving the tensor dimensions.
        vocab_size: exclusive upper bound of the generated ids.
        rng: optional ``random.Random`` used to draw the ids; a fresh
            generator is created when omitted.
        name: not used by this function.

    Returns:
        A contiguous tensor of dtype ``torch.long`` and the requested shape.
    """
    if rng is None:
        rng = random.Random()

    # Number of elements = product of all dimensions.
    total_dims = 1
    for dim in shape:
        total_dims *= dim

    # randint is inclusive on both ends, hence vocab_size - 1.
    values = [rng.randint(0, vocab_size - 1) for _ in range(total_dims)]
    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
 class ConfigTester(object):
    """Reusable checks for a configuration class.

    Args:
        parent: object providing ``assertTrue`` / ``assertEqual`` (used to report results).
        config_class: the configuration class under test.
        **kwargs: keyword arguments passed to ``config_class`` on every instantiation;
            they are also the key/value pairs expected in the JSON serialization.
    """

    def __init__(self, parent, config_class=None, **kwargs):
        self.parent = parent
        self.config_class = config_class
        self.inputs_dict = kwargs

    def create_and_test_config_common_properties(self):
        """Check that a fresh config exposes the four common size attributes."""
        config = self.config_class(**self.inputs_dict)
        for attribute in ('vocab_size', 'hidden_size', 'num_attention_heads', 'num_hidden_layers'):
            self.parent.assertTrue(hasattr(config, attribute))

    def create_and_test_config_to_json_string(self):
        """Check that every constructor kwarg round-trips through ``to_json_string``."""
        config = self.config_class(**self.inputs_dict)
        obj = json.loads(config.to_json_string())
        for key, value in self.inputs_dict.items():
            self.parent.assertEqual(obj[key], value)

    def create_and_test_config_to_json_file(self):
        """Check that ``to_json_file`` followed by ``from_json_file`` yields an equal config.

        The file is written inside a private temporary directory, which is removed
        even when (de)serialization raises. This avoids a fixed, shared path.
        """
        import shutil
        import tempfile

        config_first = self.config_class(**self.inputs_dict)
        tmp_dir = tempfile.mkdtemp()
        try:
            json_file_path = os.path.join(tmp_dir, "config.json")
            config_first.to_json_file(json_file_path)
            config_second = self.config_class.from_json_file(json_file_path)
        finally:
            shutil.rmtree(tmp_dir)
        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())

    def run_common_tests(self):
        """Run all configuration checks defined on this class."""
        self.create_and_test_config_common_properties()
        self.create_and_test_config_to_json_string()
        self.create_and_test_config_to_json_file()
 class GPTModelTester(object):
        def __init__(self,
                        parent,
                        batch_size=13,
@@ -424,10 +481,12 @@ class GPTModelTester(object):
                shutil.rmtree(cache_dir)
                self.parent.assertIsNotNone(model)
-    def create_and_check_commons(self, config, input_ids, token_type_ids, position_ids,
+        def prepare_config_and_inputs_for_common(self):
-                                    mc_labels, lm_labels, mc_token_ids):
+            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, position_ids,
                mc_labels, lm_labels, mc_token_ids) = config_and_inputs
            inputs_dict = {'input_ids': input_ids}
-        create_and_check_commons(self, config, inputs_dict)
+            return config, inputs_dict
        def run_common_tests(self, test_presents=False):
            config_and_inputs = self.prepare_config_and_inputs()
@@ -443,13 +502,61 @@ class GPTModelTester(object):
                config_and_inputs = self.prepare_config_and_inputs()
                self.create_and_check_presents(*config_and_inputs)
        config_and_inputs = self.prepare_config_and_inputs()
        self.create_and_check_commons(*config_and_inputs)
        def run_slow_tests(self):
            self.create_and_check_model_from_pretrained()
 class ConfigTester(object):
    """Reusable checks for a configuration class.

    Args:
        parent: object providing ``assertTrue`` / ``assertEqual`` (used to report results).
        config_class: the configuration class under test.
        **kwargs: keyword arguments passed to ``config_class`` on every instantiation;
            they are also the key/value pairs expected in the JSON serialization.
    """

    def __init__(self, parent, config_class=None, **kwargs):
        self.parent = parent
        self.config_class = config_class
        self.inputs_dict = kwargs

    def create_and_test_config_common_properties(self):
        """Check that a fresh config exposes the four common size attributes."""
        config = self.config_class(**self.inputs_dict)
        for attribute in ('vocab_size', 'hidden_size', 'num_attention_heads', 'num_hidden_layers'):
            self.parent.assertTrue(hasattr(config, attribute))

    def create_and_test_config_to_json_string(self):
        """Check that every constructor kwarg round-trips through ``to_json_string``."""
        config = self.config_class(**self.inputs_dict)
        obj = json.loads(config.to_json_string())
        for key, value in self.inputs_dict.items():
            self.parent.assertEqual(obj[key], value)

    def create_and_test_config_to_json_file(self):
        """Check that ``to_json_file`` followed by ``from_json_file`` yields an equal config.

        The file is written inside a private temporary directory, which is removed
        even when (de)serialization raises. This avoids a fixed, shared path.
        """
        import shutil
        import tempfile

        config_first = self.config_class(**self.inputs_dict)
        tmp_dir = tempfile.mkdtemp()
        try:
            json_file_path = os.path.join(tmp_dir, "config.json")
            config_first.to_json_file(json_file_path)
            config_second = self.config_class.from_json_file(json_file_path)
        finally:
            shutil.rmtree(tmp_dir)
        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())

    def run_common_tests(self):
        """Run all configuration checks defined on this class."""
        self.create_and_test_config_common_properties()
        self.create_and_test_config_to_json_string()
        self.create_and_test_config_to_json_file()
 def ids_tensor(shape, vocab_size, rng=None, name=None):
    """Create a random ``torch.long`` tensor of ``shape`` with values in ``[0, vocab_size)``.

    Args:
        shape: iterable of ints giving the tensor dimensions.
        vocab_size: exclusive upper bound of the generated ids.
        rng: optional ``random.Random`` used to draw the ids; a fresh
            generator is created when omitted.
        name: not used by this function.

    Returns:
        A contiguous tensor of dtype ``torch.long`` and the requested shape.
    """
    if rng is None:
        rng = random.Random()

    # Number of elements = product of all dimensions.
    total_dims = 1
    for dim in shape:
        total_dims *= dim

    # randint is inclusive on both ends, hence vocab_size - 1.
    values = [rng.randint(0, vocab_size - 1) for _ in range(total_dims)]
    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
 class ModelUtilsTest(unittest.TestCase):
    def test_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
@@ -471,79 +578,6 @@ class ModelUtilsTest(unittest.TestCase):
            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(model.config, config)
    def test_resize_tokens_embeddings(self):
        """Resize a pretrained BERT's token embeddings up by 10 and back again.

        Only the first entry of ``BERT_PRETRAINED_MODEL_ARCHIVE_MAP`` is used. After
        each resize, ``model.config.vocab_size`` and the first dimension of the word
        embedding matrix must match the requested size. At the end, the rows of the
        embedding matrix must equal the snapshot taken before any resize.
        """
        logging.basicConfig(level=logging.INFO)
        first_model_names = list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]
        for model_name in first_model_names:
            config = BertConfig.from_pretrained(model_name)
            model = BertModel.from_pretrained(model_name)
            base_vocab_size = config.vocab_size

            # Snapshot the embeddings before touching them.
            snapshot = model.embeddings.word_embeddings.weight.clone()

            # First grow the vocabulary by 10 tokens, then restore the original size.
            for extra_tokens in (10, 0):
                model.resize_token_embeddings(base_vocab_size + extra_tokens)
                # The config must track the requested vocabulary size ...
                self.assertEqual(model.config.vocab_size, base_vocab_size + extra_tokens)
                # ... and the embedding matrix must actually have that many rows.
                self.assertEqual(model.embeddings.word_embeddings.weight.shape[0],
                                 snapshot.shape[0] + extra_tokens)

            # Adding and removing tokens must not have modified the original rows.
            rows_differ = [bool(before.data.ne(after.data).sum() > 0)
                           for before, after in zip(snapshot, model.embeddings.word_embeddings.weight)]
            self.assertTrue(not any(rows_differ))
    def test_tie_model_weights(self):
        """Check that GPT-2's input embeddings and LM head share shape and values.

        Only the first entry of ``GPT2_PRETRAINED_MODEL_ARCHIVE_MAP`` is used. The
        check is repeated after in-place modification of either layer's weights and
        after ``resize_token_embeddings``.
        """
        logging.basicConfig(level=logging.INFO)

        def check_same_values(layer_1, layer_2):
            """Return True when the two layers' weights are equal row by row."""
            equal = True
            for p1, p2 in zip(layer_1.weight, layer_2.weight):
                if p1.data.ne(p2.data).sum() > 0:
                    equal = False
            return equal

        for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            config = GPT2Config.from_pretrained(model_name)
            model = GPT2LMHeadModel.from_pretrained(model_name)

            # Get the embeddings and decoding layer
            embeddings = model.transformer.wte
            decoding = model.lm_head

            # Check that the embedding layer and decoding layer are the same in size and in value.
            # assertEqual is required here: assertTrue(a, b) would treat ``b`` as the
            # failure message and pass for any non-empty shape.
            self.assertEqual(embeddings.weight.shape, decoding.weight.shape)
            self.assertTrue(check_same_values(embeddings, decoding))

            # Check that after modifying the embeddings, they remain the same.
            embeddings.weight.data.div_(2)
            self.assertEqual(embeddings.weight.shape, decoding.weight.shape)
            self.assertTrue(check_same_values(embeddings, decoding))

            # Check that after modifying the decoder, they remain the same.
            decoding.weight.data.div_(4)
            self.assertEqual(embeddings.weight.shape, decoding.weight.shape)
            self.assertTrue(check_same_values(embeddings, decoding))

            # Check that after resize they remain tied.
            model.resize_token_embeddings(config.vocab_size + 10)
            decoding.weight.data.mul_(20)
            # Re-read both layers from the model after the resize.
            self.assertEqual(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
            self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
 if __name__ == "__main__":
    unittest.main()
--- a/pytorch_transformers/tests/modeling_gpt2_test.py
+++ b/pytorch_transformers/tests/modeling_gpt2_test.py
@@ -16,19 +16,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import os
 import unittest
 import json
 import random
 import shutil
 import pytest
 import torch
 from pytorch_transformers import (GPT2Config, GPT2Model,
                                  GPT2LMHeadModel, GPT2DoubleHeadsModel)
-from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import CommonTestCases, ConfigTester
 class GPT2ModelTest(unittest.TestCase):
@@ -37,14 +32,14 @@ class GPT2ModelTest(unittest.TestCase):
        config_tester.run_common_tests()
    def test_model(self):
-        model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
                                            lm_head_model_class=GPT2LMHeadModel,
                                            double_head_model_class=GPT2DoubleHeadsModel)
        model_tester.run_common_tests(test_presents=True)
    @pytest.mark.slow
    def test_pretrained(self):
-        model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
                                            lm_head_model_class=GPT2LMHeadModel,
                                            double_head_model_class=GPT2DoubleHeadsModel)
        model_tester.run_slow_tests()
--- a/pytorch_transformers/tests/modeling_openai_test.py
+++ b/pytorch_transformers/tests/modeling_openai_test.py
@@ -19,12 +19,11 @@ from __future__ import print_function
 import unittest
 import pytest
 import torch
 from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
                                     OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
-from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import CommonTestCases, ConfigTester
 class OpenAIModelTest(unittest.TestCase):
@@ -33,14 +32,14 @@ class OpenAIModelTest(unittest.TestCase):
        config_tester.run_common_tests()
    def test_model(self):
-        model_tester = GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
                                           lm_head_model_class=OpenAIGPTLMHeadModel,
                                           double_head_model_class=OpenAIGPTDoubleHeadsModel)
        model_tester.run_common_tests(test_presents=False)
    @pytest.mark.slow
    def test_pretrained(self):
-        model_tester = GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
                                           lm_head_model_class=OpenAIGPTLMHeadModel,
                                           double_head_model_class=OpenAIGPTDoubleHeadsModel)
        model_tester.run_slow_tests()
--- a/pytorch_transformers/tests/modeling_transfo_xl_test.py
+++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py
@@ -28,9 +28,15 @@ import torch
 from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
 from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor
 class TransfoXLModelTest(CommonTestCases.CommonModelTester):
    all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel)
    test_pruning = False
    test_torchscript = False
    test_resize_embeddings = False
 class TransfoXLModelTest(unittest.TestCase):
    class TransfoXLModelTester(object):
        def __init__(self,
@@ -52,7 +58,6 @@ class TransfoXLModelTest(unittest.TestCase):
                     num_hidden_layers=5,
                     scope=None,
                     seed=1,
                     all_model_classes=(TransfoXLModel, TransfoXLLMHeadModel),
                     ):
            self.parent = parent
            self.batch_size = batch_size
@@ -73,7 +78,6 @@ class TransfoXLModelTest(unittest.TestCase):
            self.num_hidden_layers = num_hidden_layers
            self.scope = scope
            self.seed = seed
            self.all_model_classes = all_model_classes
        def prepare_config_and_inputs(self):
            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -171,16 +175,31 @@ class TransfoXLModelTest(unittest.TestCase):
                list(list(mem.size()) for mem in result["mems_2"]),
                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
-        def create_and_check_transfo_xl_commons(self, config, input_ids_1, input_ids_2, lm_labels):
+        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids_1}
-            create_and_check_commons(self, config, inputs_dict, test_pruning=False, test_torchscript=False)
+            return config, inputs_dict
-    def test_default(self):
+
-        self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self))
+    def setUp(self):
        self.model_tester = TransfoXLModelTest.TransfoXLModelTester(self)
        self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
    def test_config(self):
-        config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
+        self.config_tester.run_common_tests()
-        config_tester.run_common_tests()
+
    def test_transfo_xl_model(self):
        self.model_tester.set_seed()
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        output_result = self.model_tester.create_transfo_xl_model(*config_and_inputs)
        self.model_tester.check_transfo_xl_model_output(output_result)
    def test_transfo_xl_lm_head(self):
        self.model_tester.set_seed()
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs)
        self.model_tester.check_transfo_xl_lm_head_output(output_result)
    @pytest.mark.slow
    def test_model_from_pretrained(self):
@@ -190,23 +209,6 @@ class TransfoXLModelTest(unittest.TestCase):
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
    def run_tester(self, tester):
        """Run the model, LM-head and common checks of ``tester`` in sequence.

        Each stage re-seeds the tester and prepares fresh inputs before running.
        """
        # Initial preparation whose result is not used (kept for identical call order).
        tester.prepare_config_and_inputs()

        # (builder, checker) method names of the two output-checking stages.
        stages = (
            ("create_transfo_xl_model", "check_transfo_xl_model_output"),
            ("create_transfo_xl_lm_head", "check_transfo_xl_lm_head_output"),
        )
        for builder_name, checker_name in stages:
            tester.set_seed()
            prepared = tester.prepare_config_and_inputs()
            produced = getattr(tester, builder_name)(*prepared)
            getattr(tester, checker_name)(produced)

        tester.set_seed()
        prepared = tester.prepare_config_and_inputs()
        tester.create_and_check_transfo_xl_commons(*prepared)
 if __name__ == "__main__":
    unittest.main()
--- a/pytorch_transformers/tests/modeling_xlm_test.py
+++ b/pytorch_transformers/tests/modeling_xlm_test.py
@@ -23,10 +23,15 @@ import pytest
 from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
 from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
-class XLMModelTest(unittest.TestCase):
+class XLMModelTest(CommonTestCases.CommonModelTester):
    all_model_classes = (XLMModel, XLMWithLMHeadModel,  
                         XLMForQuestionAnswering, XLMForSequenceClassification) 
                         # , XLMForSequenceClassification, XLMForTokenClassification),
    class XLMModelTester(object):
        def __init__(self,
@@ -58,8 +63,6 @@ class XLMModelTest(unittest.TestCase):
                     summary_type="last",
                     use_proj=True,
                     scope=None,
                     all_model_classes = (XLMModel, XLMWithLMHeadModel,
                                          XLMForQuestionAnswering, XLMForSequenceClassification),  # , XLMForSequenceClassification, XLMForTokenClassification),
                    ):
            self.parent = parent
            self.batch_size = batch_size
@@ -90,7 +93,6 @@ class XLMModelTest(unittest.TestCase):
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
            self.all_model_classes = all_model_classes
        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -237,28 +239,23 @@ class XLMModelTest(unittest.TestCase):
                [self.batch_size, self.type_sequence_label_size])
-        def create_and_check_xlm_commons(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, input_lengths,
             sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths}
-            create_and_check_commons(self, config, inputs_dict)
+            return config, inputs_dict
-    def test_default(self):
+    def setUp(self):
-        self.run_tester(XLMModelTest.XLMModelTester(self))
+        self.model_tester = XLMModelTest.XLMModelTester(self)
        self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
    def test_config(self):
-        config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
+        self.config_tester.run_common_tests()
        config_tester.run_common_tests()
-    @pytest.mark.slow
+    def test_xlm_model(self):
-    def test_model_from_pretrained(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        self.model_tester.create_and_check_xlm_model(*config_and_inputs)
        for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
    def run_tester(self, tester):
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_xlm_model(*config_and_inputs)
        # config_and_inputs = tester.prepare_config_and_inputs()
        # tester.create_and_check_xlm_for_masked_lm(*config_and_inputs)
@@ -275,8 +272,14 @@ class XLMModelTest(unittest.TestCase):
        # config_and_inputs = tester.prepare_config_and_inputs()
        # tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
-        config_and_inputs = tester.prepare_config_and_inputs()
+    @pytest.mark.slow
-        tester.create_and_check_xlm_commons(*config_and_inputs)
+    def test_model_from_pretrained(self):
        cache_dir = "/tmp/pytorch_transformers_test/"
        for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
 if __name__ == "__main__":
    unittest.main()
--- a/pytorch_transformers/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -28,9 +28,14 @@ import torch
 from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
 from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor
 class XLNetModelTest(CommonTestCases.CommonModelTester):
    all_model_classes=(XLNetModel, XLNetLMHeadModel,
                    XLNetForSequenceClassification, XLNetForQuestionAnswering)
    test_pruning = False
 class XLNetModelTest(unittest.TestCase):
    class XLNetModelTester(object):
        def __init__(self,
@@ -56,8 +61,6 @@ class XLNetModelTest(unittest.TestCase):
                     initializer_range=0.05,
                     seed=1,
                     type_vocab_size=2,
                     all_model_classes=(XLNetModel, XLNetLMHeadModel,
                                        XLNetForSequenceClassification, XLNetForQuestionAnswering),
            ):
            self.parent = parent
            self.batch_size = batch_size
@@ -82,7 +85,6 @@ class XLNetModelTest(unittest.TestCase):
            self.seed = seed
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.all_model_classes = all_model_classes
        def prepare_config_and_inputs(self):
            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -264,17 +266,41 @@ class XLNetModelTest(unittest.TestCase):
                list(list(mem.size()) for mem in result["mems_1"]),
                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
-        def create_and_check_xlnet_commons(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
+        def prepare_config_and_inputs_for_common(self):
-                target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                target_mapping, inp_q, segment_ids, lm_labels,
                sequence_labels, is_impossible_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids_1}
-            create_and_check_commons(self, config, inputs_dict, test_pruning=False)
+            return config, inputs_dict
-    def test_default(self):
+
-        self.run_tester(XLNetModelTest.XLNetModelTester(self))
+    def setUp(self):
        self.model_tester = XLNetModelTest.XLNetModelTester(self)
        self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
    def test_config(self):
-        config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
+        self.config_tester.run_common_tests()
-        config_tester.run_common_tests()
+
    def test_xlnet_base_model(self):
        self.model_tester.set_seed()
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs)
    def test_xlnet_lm_head(self):
        self.model_tester.set_seed()
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs) 
    def test_xlnet_sequence_classif(self):
        self.model_tester.set_seed()
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
    def test_xlnet_qa(self):
        self.model_tester.set_seed()
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
    @pytest.mark.slow
    def test_model_from_pretrained(self):
@@ -284,27 +310,6 @@ class XLNetModelTest(unittest.TestCase):
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
    def run_tester(self, tester):
        tester.set_seed()
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_xlnet_base_model(*config_and_inputs)
        tester.set_seed()
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_xlnet_lm_head(*config_and_inputs) 
        tester.set_seed()
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
        tester.set_seed()
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_xlnet_qa(*config_and_inputs)
        tester.set_seed()
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_xlnet_commons(*config_and_inputs)
 if __name__ == "__main__":
    unittest.main()