updating tests

2019-07-12 10:57:58 +02:00
parent 3fbceed8d2
commit 2918b7d2a0
14 changed files with 672 additions and 596 deletions
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -617,6 +617,7 @@ class BertModel(BertPreTrainedModel):
        old_embeddings = self.embeddings.word_embeddings
        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
        self.embeddings.word_embeddings = new_embeddings
        return self.embeddings.word_embeddings
    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
@@ -758,11 +759,8 @@ class BertForPreTraining(BertPreTrainedModel):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        input_embeddings = self.bert.embeddings.word_embeddings.weight
+        self._tie_or_clone_weights(self.cls.predictions.decoder,
-        if self.config.torchscript:
+                                   self.bert.embeddings.word_embeddings)
            self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
        else:
            self.cls.predictions.decoder.weight = input_embeddings  # Tied weights
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                next_sentence_label=None, head_mask=None):
@@ -864,11 +862,8 @@ class BertForMaskedLM(BertPreTrainedModel):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        input_embeddings = self.bert.embeddings.word_embeddings.weight
+        self._tie_or_clone_weights(self.cls.predictions.decoder,
-        if self.config.torchscript:
+                                   self.bert.embeddings.word_embeddings)
            self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
        else:
            self.cls.predictions.decoder.weight = input_embeddings  # Tied weights
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
        """
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -414,6 +414,7 @@ class GPT2Model(GPT2PreTrainedModel):
    def _resize_token_embeddings(self, new_num_tokens):
        self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
        return self.wte
    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
@@ -562,11 +563,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        input_embeddings = self.transformer.wte.weight
+        self._tie_or_clone_weights(self.lm_head,
-        if self.config.torchscript:
+                                   self.transformer.wte)
            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
        else:
            self.lm_head.weight = input_embeddings  # Tied weights
    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
        """
@@ -658,11 +656,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        input_embeddings = self.transformer.wte.weight
+        self._tie_or_clone_weights(self.lm_head,
-        if self.config.torchscript:
+                                   self.transformer.wte)
            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
        else:
            self.lm_head.weight = input_embeddings  # Tied weights
    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                position_ids=None, past=None, head_mask=None):
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -430,6 +430,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
    def _resize_token_embeddings(self, new_num_tokens):
        self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
        return self.tokens_embed
    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
@@ -583,11 +584,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        input_embeddings = self.transformer.tokens_embed.weight
+        self._tie_or_clone_weights(self.lm_head,
-        if self.config.torchscript:
+                                   self.transformer.tokens_embed)
            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
        else:
            self.lm_head.weight = input_embeddings  # Tied weights
    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
        """
@@ -696,11 +694,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
-        input_embeddings = self.transformer.tokens_embed.weight
+        self._tie_or_clone_weights(self.lm_head,
-        if self.config.torchscript:
+                                   self.transformer.tokens_embed)
            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
        else:
            self.lm_head.weight = input_embeddings  # Tied weights
    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                position_ids=None, head_mask=None):
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -291,6 +291,10 @@ class TransfoXLConfig(PretrainedConfig):
    def vocab_size(self):
        return self.n_token
    @vocab_size.setter
    def vocab_size(self, value):
        """ Setter for `vocab_size`: stores `value` in `self.n_token`. """
        self.n_token = value
    @property
    def hidden_size(self):
        """ Property alias returning `self.d_model`. """
        return self.d_model
@@ -1003,7 +1007,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        self.apply(self.init_weights)
    def _resize_token_embeddings(self, new_num_tokens):
-        raise NotImplementedError
+        return self.word_emb
    def backward_compatible(self):
        self.sample_softmax = -1
@@ -1280,13 +1284,20 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
        else:
            if self.config.tie_weight:
                for i in range(len(self.crit.out_layers)):
-                    self.crit.out_layers[i].weight = self.transformer.word_emb.emb_layers[i].weight
+                    self._tie_or_clone_weights(self.crit.out_layers[i],
                                               self.transformer.word_emb.emb_layers[i])
            if self.config.tie_projs:
                for i, tie_proj in enumerate(self.config.tie_projs):
                    if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
-                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
+                        if self.config.torchscript:
                            self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone())
                        else:
                            self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
                    elif tie_proj and self.config.div_val != 1:
-                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
+                        if self.config.torchscript:
                            self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone())
                        else:
                            self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
    def reset_length(self, tgt_len, ext_len, mem_len):
        self.transformer.reset_length(tgt_len, ext_len, mem_len)
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -165,9 +165,27 @@ class PreTrainedModel(nn.Module):
        # Save config in model
        self.config = config
-    def _get_resized_embeddings(self, old_embeddings, new_num_tokens):
+    def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
-        # Build new embeddings
+        """ Build a resized Embedding Module from a provided token Embedding Module.
            Increasing the size will add newly initialized vectors at the end
            Reducing the size will remove vectors from the end
        Args:
            new_num_tokens: (Optional) New number of tokens in the embedding matrix.
                Increasing the size will add newly initialized vectors at the end
                Reducing the size will remove vectors from the end
                If not provided or None: return the provided token Embedding Module.
        Return:
            Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
        """
        if new_num_tokens is None:
            return old_embeddings
        old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
        if old_num_tokens == new_num_tokens:
            return old_embeddings
        # Build new embeddings
        new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
        new_embeddings.to(old_embeddings.weight.device)
@@ -180,18 +198,29 @@ class PreTrainedModel(nn.Module):
        return new_embeddings
-    def resize_token_embeddings(self, new_num_tokens):
+    def _tie_or_clone_weights(self, first_module, second_module):
-        """ Resize input token embeddings matrix.
+        """ Tie or clone module weights depending on whether we are using TorchScript or not
        """
        if self.config.torchscript:
            first_module.weight = nn.Parameter(second_module.weight.clone())
        else:
            first_module.weight = second_module.weight
    def resize_token_embeddings(self, new_num_tokens=None):
        """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
        Args:
-            new_num_tokens: New number of tokens in the embedding matrix.
+            new_num_tokens: (Optional) New number of tokens in the embedding matrix.
                Increasing the size will add newly initialized vectors at the end
                Reducing the size will remove vectors from the end
                If not provided or None: does nothing.
        Return:
            Pointer to the input tokens Embedding Module of the model
        """
        if new_num_tokens == self.config.vocab_size:
            return
        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
-        base_model._resize_token_embeddings(new_num_tokens)
+        model_embeds = base_model._resize_token_embeddings(new_num_tokens)
        if new_num_tokens is None:
            return model_embeds
        # Update base model and current model config
        self.config.vocab_size = new_num_tokens
@@ -201,6 +230,8 @@ class PreTrainedModel(nn.Module):
        if hasattr(self, 'tie_weights'):
            self.tie_weights()
        return model_embeds
    def prune_heads(self, heads_to_prune):
        """ Prunes heads of the base model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -184,6 +184,10 @@ class XLMConfig(PretrainedConfig):
    def vocab_size(self):
        return self.n_words
    @vocab_size.setter
    def vocab_size(self, value):
        """ Setter for `vocab_size`: stores `value` in `self.n_words`. """
        self.n_words = value
    @property
    def hidden_size(self):
        """ Property alias returning `self.emb_dim`. """
        return self.emb_dim
@@ -479,6 +483,7 @@ class XLMModel(XLMPreTrainedModel):
    def _resize_token_embeddings(self, new_num_tokens):
        self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
        return self.embeddings
    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
@@ -728,10 +733,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
    def tie_weights(self):
        """ Make sure we are sharing the embeddings
        """
-        if self.config.torchscript:
+        self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)
            self.pred_layer.proj.weight = nn.Parameter(self.transformer.embeddings.weight.clone())
        else:
            self.pred_layer.proj.weight = self.transformer.embeddings.weight
    def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None,
                attention_mask=None, cache=None, labels=None, head_mask=None):
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -316,6 +316,10 @@ class XLNetConfig(PretrainedConfig):
    def vocab_size(self):
        return self.n_token
    @vocab_size.setter
    def vocab_size(self, value):
        """ Setter for `vocab_size`: stores `value` in `self.n_token`. """
        self.n_token = value
    @property
    def hidden_size(self):
        """ Property alias returning `self.d_model`. """
        return self.d_model
@@ -660,10 +664,10 @@ class XLNetModel(XLNetPreTrainedModel):
    def _resize_token_embeddings(self, new_num_tokens):
        self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
        return self.word_embedding
    def _prune_heads(self, heads_to_prune):
-        logger.info("Head pruning is not implemented for XLNet")
+        raise NotImplementedError
        pass
    def create_mask(self, qlen, mlen):
        """
@@ -987,10 +991,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
    def tie_weights(self):
        """ Make sure we are sharing the embeddings
        """
-        if self.config.torchscript:
+        self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)
            self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone())
        else:
            self.lm_loss.weight = self.transformer.word_embedding.weight
    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
--- a/pytorch_transformers/tests/modeling_bert_test.py
+++ b/pytorch_transformers/tests/modeling_bert_test.py
@@ -26,10 +26,15 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                     BertForTokenClassification, BertForMultipleChoice)
 from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
-class BertModelTest(unittest.TestCase):
+class BertModelTest(CommonTestCases.CommonModelTester):
    all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
            BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
            BertForTokenClassification)
    class BertModelTester(object):
        def __init__(self,
@@ -55,9 +60,6 @@ class BertModelTest(unittest.TestCase):
                     num_labels=3,
                     num_choices=4,
                     scope=None,
                     all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
                             BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
                             BertForTokenClassification),
                    ):
            self.parent = parent
            self.batch_size = batch_size
@@ -81,7 +83,6 @@ class BertModelTest(unittest.TestCase):
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
            self.all_model_classes = all_model_classes
        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -253,16 +254,51 @@ class BertModelTest(unittest.TestCase):
            self.check_loss_output(result)
-        def create_and_check_bert_commons(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, input_mask,
             sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
-            create_and_check_commons(self, config, inputs_dict)
+            return config, inputs_dict
-    def test_default(self):
+    def setUp(self):
-        self.run_tester(BertModelTest.BertModelTester(self))
+        self.model_tester = BertModelTest.BertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)
    def test_config(self):
-        config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)
+        self.config_tester.run_common_tests()
-        config_tester.run_common_tests()
+
    def test_bert_model(self):
        # Build fresh config/inputs, then hand them to the tester's
        # `create_and_check_bert_model` check.
        tester = self.model_tester
        prepared = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_model(*prepared)
    def test_for_masked_lm(self):
        # Build fresh config/inputs, then hand them to the tester's
        # `create_and_check_bert_for_masked_lm` check.
        tester = self.model_tester
        prepared = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_for_masked_lm(*prepared)
    def test_for_multiple_choice(self):
        # Build fresh config/inputs, then hand them to the tester's
        # `create_and_check_bert_for_multiple_choice` check.
        tester = self.model_tester
        prepared = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_for_multiple_choice(*prepared)
    def test_for_next_sequence_prediction(self):
        # Build fresh config/inputs, then hand them to the tester's
        # `create_and_check_bert_for_next_sequence_prediction` check.
        tester = self.model_tester
        prepared = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_for_next_sequence_prediction(*prepared)
    def test_for_pretraining(self):
        # Build fresh config/inputs, then hand them to the tester's
        # `create_and_check_bert_for_pretraining` check.
        tester = self.model_tester
        prepared = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_for_pretraining(*prepared)
    def test_for_question_answering(self):
        # Build fresh config/inputs, then hand them to the tester's
        # `create_and_check_bert_for_question_answering` check.
        tester = self.model_tester
        prepared = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_for_question_answering(*prepared)
    def test_for_sequence_classification(self):
        # Build fresh config/inputs, then hand them to the tester's
        # `create_and_check_bert_for_sequence_classification` check.
        tester = self.model_tester
        prepared = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_for_sequence_classification(*prepared)
    def test_for_token_classification(self):
        # Build fresh config/inputs, then hand them to the tester's
        # `create_and_check_bert_for_token_classification` check.
        tester = self.model_tester
        prepared = tester.prepare_config_and_inputs()
        tester.create_and_check_bert_for_token_classification(*prepared)
    @pytest.mark.slow
    def test_model_from_pretrained(self):
@@ -272,33 +308,5 @@ class BertModelTest(unittest.TestCase):
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
    def run_tester(self, tester):
        """ Run each `create_and_check_bert_*` check of `tester` in a fixed order.

        Args:
            tester: object exposing `prepare_config_and_inputs()` and the
                check methods named in `check_names` below.
        """
        check_names = (
            'create_and_check_bert_model',
            'create_and_check_bert_for_masked_lm',
            'create_and_check_bert_for_multiple_choice',
            'create_and_check_bert_for_next_sequence_prediction',
            'create_and_check_bert_for_pretraining',
            'create_and_check_bert_for_question_answering',
            'create_and_check_bert_for_sequence_classification',
            'create_and_check_bert_for_token_classification',
            'create_and_check_bert_commons',
        )
        for check_name in check_names:
            # Inputs are rebuilt before every check rather than shared between checks
            fresh_inputs = tester.prepare_config_and_inputs()
            getattr(tester, check_name)(*fresh_inputs)
 if __name__ == "__main__":
    unittest.main()
--- a/pytorch_transformers/tests/modeling_common_test.py
+++ b/pytorch_transformers/tests/modeling_common_test.py
@@ -39,207 +39,471 @@ def _config_zero_init(config):
            setattr(configs_no_init, key, 0.0)
    return configs_no_init
-def _create_and_check_torchscript_output_attentions(tester, model_classes, config, inputs_dict):
+class CommonTestCases:
    config.output_attentions = True
    _create_and_check_torchscript(tester, model_classes, config, inputs_dict)
-def _create_and_check_torchscript_output_hidden_state(tester, model_classes, config, inputs_dict):
+    class CommonModelTester(unittest.TestCase):
    config.output_hidden_states = True
    _create_and_check_torchscript(tester, model_classes, config, inputs_dict)
-def _create_and_check_torchscript(tester, model_classes, config, inputs_dict):
+        model_tester = None
-    configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+        all_model_classes = ()
-    configs_no_init.torchscript = True
+        test_torchscript = True
-    for model_class in model_classes:
+        test_pruning = True
-        model = model_class(config=configs_no_init)
+        test_resize_embeddings = True
        model.eval()
        inputs = inputs_dict['input_ids']  # Let's keep only input_ids
-        try:
+        def test_initialization(self):
-            torch.jit.trace(model, inputs)
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        except RuntimeError:
            tester.parent.fail("Couldn't trace module.")
-        try:
+            configs_no_init = _config_zero_init(config)
-            traced_gpt2 = torch.jit.trace(model, inputs)
+            for model_class in self.all_model_classes:
-            torch.jit.save(traced_gpt2, "traced_model.pt")
+                model = model_class(config=configs_no_init)
-        except RuntimeError:
+                for name, param in model.named_parameters():
-            tester.parent.fail("Couldn't save module.")
+                    if param.requires_grad:
                        self.assertIn(param.data.mean().item(), [0.0, 1.0],
                        msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
-        try:
+        def test_attention_outputs(self):
-            loaded_model = torch.jit.load("traced_model.pt")
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            os.remove("traced_model.pt")
        except ValueError:
            tester.parent.fail("Couldn't load module.")
-        model.eval()
+            for model_class in self.all_model_classes:
-        loaded_model.eval()
+                config.output_attentions = True
                config.output_hidden_states = False
                model = model_class(config)
                model.eval()
                outputs = model(**inputs_dict)
                attentions = outputs[-1]
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, False)
                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads,
                    self.model_tester.seq_length,
                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
                out_len = len(outputs)
-        model_params = model.parameters()
+                # Check attention is always last and order is fine
-        loaded_model_params = loaded_model.parameters()
+                config.output_attentions = True
                config.output_hidden_states = True
                model = model_class(config)
                model.eval()
                outputs = model(**inputs_dict)
                self.assertEqual(out_len+1, len(outputs))
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, True)
-        models_equal = True
+                attentions = outputs[-1]
-        for p1, p2 in zip(model_params, loaded_model_params):
+                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-            if p1.data.ne(p2.data).sum() > 0:
+                self.assertListEqual(
-                models_equal = False
+                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads,
                    self.model_tester.seq_length,
                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
-        tester.parent.assertTrue(models_equal)
+        def test_torchscript(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-def _create_and_check_initialization(tester, model_classes, config, inputs_dict):
+            self._create_and_check_torchscript(config, inputs_dict)
    configs_no_init = _config_zero_init(config)
    for model_class in model_classes:
        model = model_class(config=configs_no_init)
        for name, param in model.named_parameters():
            if param.requires_grad:
                tester.parent.assertIn(param.data.mean().item(), [0.0, 1.0],
                                       msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
-def _create_and_check_for_headmasking(tester, model_classes, config, inputs_dict):
+        def test_torchscript_output_attentions(self):
-    configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    for model_class in model_classes:
        config.output_attentions = True
        config.output_hidden_states = True
        model = model_class(config=configs_no_init)
        model.eval()
-        # Prepare head_mask
+            config.output_attentions = True
-        # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) 
+            self._create_and_check_torchscript(config, inputs_dict)
        head_mask = torch.ones(tester.num_hidden_layers, tester.num_attention_heads)
        head_mask[0, 0] = 0
        head_mask[-1, :-1] = 0
        head_mask.requires_grad_(requires_grad=True)
        inputs = inputs_dict.copy()
        inputs['head_mask'] = head_mask
-        outputs = model(**inputs)
+        def test_torchscript_output_hidden_state(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        # Test that we can get a gradient back for importance score computation
+            config.output_hidden_states = True
-        output = sum(t.sum() for t in outputs[0])
+            self._create_and_check_torchscript(config, inputs_dict)
        output = output.sum()
        output.backward()
        multihead_outputs = head_mask.grad
-        attentions = outputs[-1]
+        def _create_and_check_torchscript(self, config, inputs_dict):
-        hidden_states = outputs[-2]
+            if not self.test_torchscript:
                return
-        # Remove Nan
+            configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
            configs_no_init.torchscript = True
            for model_class in self.all_model_classes:
                model = model_class(config=configs_no_init)
                model.eval()
                inputs = inputs_dict['input_ids']  # Let's keep only input_ids
-        tester.parent.assertIsNotNone(multihead_outputs)
+                try:
-        tester.parent.assertEqual(len(multihead_outputs), tester.num_hidden_layers)
+                    torch.jit.trace(model, inputs)
-        tester.parent.assertAlmostEqual(
+                except RuntimeError:
-            attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
+                    self.fail("Couldn't trace module.")
-        tester.parent.assertNotEqual(
+
-            attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
+                try:
-        tester.parent.assertNotEqual(
+                    traced_gpt2 = torch.jit.trace(model, inputs)
-            attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
+                    torch.jit.save(traced_gpt2, "traced_model.pt")
-        tester.parent.assertAlmostEqual(
+                except RuntimeError:
-            attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
+                    self.fail("Couldn't save module.")
-        tester.parent.assertNotEqual(
+
-            attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
+                try:
                    loaded_model = torch.jit.load("traced_model.pt")
                    os.remove("traced_model.pt")
                except ValueError:
                    self.fail("Couldn't load module.")
                model.eval()
                loaded_model.eval()
                model_params = model.parameters()
                loaded_model_params = loaded_model.parameters()
                models_equal = True
                for p1, p2 in zip(model_params, loaded_model_params):
                    if p1.data.ne(p2.data).sum() > 0:
                        models_equal = False
                self.assertTrue(models_equal)
-def _create_and_check_for_head_pruning(tester, model_classes, config, inputs_dict):
+        def test_headmasking(self):
-    for model_class in model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.output_attentions = True
        config.output_hidden_states = False
        model = model_class(config=config)
        model.eval()
        heads_to_prune = {0: list(range(1, tester.num_attention_heads)),
                          -1: [0]}
        model.prune_heads(heads_to_prune)
        outputs = model(**inputs_dict)
-        attentions = outputs[-1]
+            config.output_attentions = True
            config.output_hidden_states = True
            configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
            for model_class in self.all_model_classes:
                model = model_class(config=configs_no_init)
                model.eval()
-        tester.parent.assertEqual(
+                # Prepare head_mask
-            attentions[0].shape[-3], 1)
+                # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) 
-        tester.parent.assertEqual(
+                head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
-            attentions[1].shape[-3], tester.num_attention_heads)
+                head_mask[0, 0] = 0
-        tester.parent.assertEqual(
+                head_mask[-1, :-1] = 0
-            attentions[-1].shape[-3], tester.num_attention_heads - 1)
+                head_mask.requires_grad_(requires_grad=True)
                inputs = inputs_dict.copy()
                inputs['head_mask'] = head_mask
                outputs = model(**inputs)
                # Test that we can get a gradient back for importance score computation
                output = sum(t.sum() for t in outputs[0])
                output = output.sum()
                output.backward()
                multihead_outputs = head_mask.grad
                attentions = outputs[-1]
                hidden_states = outputs[-2]
                # Remove Nan
                self.assertIsNotNone(multihead_outputs)
                self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
                self.assertAlmostEqual(
                    attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
                self.assertNotEqual(
                    attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
                self.assertNotEqual(
                    attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
                self.assertAlmostEqual(
                    attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
                self.assertNotEqual(
                    attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
-def _create_and_check_for_attentions(tester, model_classes, config, inputs_dict):
+        def test_head_pruning(self):
-    for model_class in model_classes:
+            if not self.test_pruning:
-        config.output_attentions = True
+                return
        config.output_hidden_states = False
        model = model_class(config)
        model.eval()
        outputs = model(**inputs_dict)
        attentions = outputs[-1]
        tester.parent.assertEqual(model.config.output_attentions, True)
        tester.parent.assertEqual(model.config.output_hidden_states, False)
        tester.parent.assertEqual(len(attentions), tester.num_hidden_layers)
        tester.parent.assertListEqual(
            list(attentions[0].shape[-3:]),
            [tester.num_attention_heads,
             tester.seq_length,
             tester.key_len if hasattr(tester, 'key_len') else tester.seq_length])
        out_len = len(outputs)
-        # Check attention is always last and order is fine
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.output_attentions = True
        config.output_hidden_states = True
        model = model_class(config)
        model.eval()
        outputs = model(**inputs_dict)
        tester.parent.assertEqual(out_len+1, len(outputs))
        tester.parent.assertEqual(model.config.output_attentions, True)
        tester.parent.assertEqual(model.config.output_hidden_states, True)
-        attentions = outputs[-1]
+            for model_class in self.all_model_classes:
-        tester.parent.assertEqual(len(attentions), tester.num_hidden_layers)
+                config.output_attentions = True
-        tester.parent.assertListEqual(
+                config.output_hidden_states = False
-            list(attentions[0].shape[-3:]),
+                model = model_class(config=config)
-            [tester.num_attention_heads,
+                model.eval()
-             tester.seq_length,
+                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
-             tester.key_len if hasattr(tester, 'key_len') else tester.seq_length])
+                                -1: [0]}
                model.prune_heads(heads_to_prune)
                outputs = model(**inputs_dict)
-def _create_and_check_for_hidden_states(tester, model_classes, config, inputs_dict):
+                attentions = outputs[-1]
-    for model_class in model_classes:
+
-        config.output_hidden_states = True
+                self.assertEqual(
-        config.output_attentions = False
+                    attentions[0].shape[-3], 1)
-        model = model_class(config)
+                self.assertEqual(
-        model.eval()
+                    attentions[1].shape[-3], self.model_tester.num_attention_heads)
-        outputs = model(**inputs_dict)
+                self.assertEqual(
-        hidden_states = outputs[-1]
+                    attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
        tester.parent.assertEqual(model.config.output_attentions, False)
        tester.parent.assertEqual(model.config.output_hidden_states, True)
        tester.parent.assertEqual(len(hidden_states), tester.num_hidden_layers + 1)
        tester.parent.assertListEqual(
            list(hidden_states[0].shape[-2:]),
            [tester.seq_length, tester.hidden_size])
-def create_and_check_commons(tester, config, inputs_dict, test_pruning=True, test_torchscript=True):
+        def test_hidden_states_output(self):
-    _create_and_check_initialization(tester, tester.all_model_classes, config, inputs_dict)
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    _create_and_check_for_attentions(tester, tester.all_model_classes, config, inputs_dict)
    _create_and_check_for_headmasking(tester, tester.all_model_classes, config, inputs_dict)
    _create_and_check_for_hidden_states(tester, tester.all_model_classes, config, inputs_dict)
-    if test_torchscript:
+            for model_class in self.all_model_classes:
-        _create_and_check_torchscript(tester, tester.all_model_classes, config, inputs_dict)
+                config.output_hidden_states = True
-        _create_and_check_torchscript_output_attentions(tester, tester.all_model_classes, config, inputs_dict)
+                config.output_attentions = False
-        _create_and_check_torchscript_output_hidden_state(tester, tester.all_model_classes, config, inputs_dict)
+                model = model_class(config)
                model.eval()
                outputs = model(**inputs_dict)
                hidden_states = outputs[-1]
                self.assertEqual(model.config.output_attentions, False)
                self.assertEqual(model.config.output_hidden_states, True)
                self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                self.assertListEqual(
                    list(hidden_states[0].shape[-2:]),
                    [self.model_tester.seq_length, self.model_tester.hidden_size])
-    if test_pruning:
+        def test_resize_tokens_embeddings(self):
-        _create_and_check_for_head_pruning(tester, tester.all_model_classes, config, inputs_dict)
+            original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            if not self.test_resize_embeddings:
                return
            for model_class in self.all_model_classes:
                config = copy.deepcopy(original_config)
                model = model_class(config)
                model_vocab_size = config.vocab_size
                # Retrieve the embeddings and clone them
                model_embed = model.resize_token_embeddings(model_vocab_size)
                cloned_embeddings = model_embed.weight.clone()
                # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
                model_embed = model.resize_token_embeddings(model_vocab_size + 10)
                self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
                # Check that it actually resizes the embeddings matrix
                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
                # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
                model_embed = model.resize_token_embeddings(model_vocab_size - 15)
                self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
                # Check that it actually resizes the embeddings matrix
                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
                # Check that adding and removing tokens has not modified the first part of the embedding matrix.
                models_equal = True
                for p1, p2 in zip(cloned_embeddings, model_embed.weight):
                    if p1.data.ne(p2.data).sum() > 0:
                        models_equal = False
                self.assertTrue(models_equal)
        def test_tie_model_weights(self):
            """Check that tied models expose fewer parameters than cloned ones.

            For every model class that defines ``tie_weights``, one model is built
            with ``config.torchscript = True`` and one with ``torchscript = False``.
            The test asserts that the second model yields strictly fewer entries
            from ``parameters()`` than the first, and that this still holds after
            ``resize_token_embeddings`` has been called on it.
            """
            config, _ = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                if not hasattr(model_class, 'tie_weights'):
                    continue

                # Reference model built with the torchscript flag on.
                config.torchscript = True
                model_not_tied = model_class(config)
                params_not_tied = list(model_not_tied.parameters())

                # Same configuration with the flag off; work on a copy so the
                # shared config keeps torchscript=True for the reference model.
                config_tied = copy.deepcopy(config)
                config_tied.torchscript = False
                model_tied = model_class(config_tied)
                params_tied = list(model_tied.parameters())

                self.assertGreater(len(params_not_tied), len(params_tied))

                # Check that after resize they remain tied: the parameter count
                # must be unchanged and still below the untied model's count.
                model_tied.resize_token_embeddings(config.vocab_size + 10)
                params_tied_2 = list(model_tied.parameters())
                self.assertEqual(len(params_tied_2), len(params_tied))
                self.assertGreater(len(params_not_tied), len(params_tied_2))
-def ids_tensor(shape, vocab_size, rng=None, name=None):
+    class GPTModelTester(CommonModelTester):
    """Creates a random int32 tensor of the shape within the vocab size."""
    if rng is None:
        rng = random.Random()
-    total_dims = 1
+        def __init__(self,
-    for dim in shape:
+                        parent,
-        total_dims *= dim
+                        batch_size=13,
                        seq_length=7,
                        is_training=True,
                        use_position_ids=True,
                        use_token_type_ids=True,
                        use_labels=True,
                        vocab_size=99,
                        n_positions=33,
                        hidden_size=32,
                        num_hidden_layers=5,
                        num_attention_heads=4,
                        n_choices=3,
                        type_sequence_label_size=2,
                        initializer_range=0.02,
                        num_labels=3,
                        scope=None,
                        config_class=None,
                        base_model_class=None,
                        lm_head_model_class=None,
                        double_head_model_class=None,
                        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_position_ids = use_position_ids
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.n_positions = n_positions
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.n_choices = n_choices
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.scope = scope
            self.config_class = config_class
            self.base_model_class = base_model_class
            self.lm_head_model_class = lm_head_model_class
            self.double_head_model_class = double_head_model_class
            self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
-    values = []
+        def prepare_config_and_inputs(self):
-    for _ in range(total_dims):
+            total_num_tokens = self.vocab_size
-        values.append(rng.randint(0, vocab_size - 1))
+            input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
-    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+            position_ids = None
            if self.use_position_ids:
                position_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
            token_type_ids = None
            if self.use_token_type_ids:
                total_voc = self.vocab_size
                token_type_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
            mc_labels = None
            lm_labels = None
            mc_token_ids = None
            if self.use_labels:
                mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                lm_labels = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
                mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
            config = self.config_class(
                vocab_size_or_config_json_file=self.vocab_size,
                n_positions=self.n_positions,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
                initializer_range=self.initializer_range)
            return (config, input_ids, token_type_ids, position_ids,
                    mc_labels, lm_labels, mc_token_ids)
        def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
                                mc_labels, lm_labels, mc_token_ids):
            """Run the base model in eval mode and check the shape of its first output.

            The model is called with three, two and one positional inputs; only the
            result of the last call (``input_ids`` alone) is inspected. ``mc_labels``,
            ``lm_labels`` and ``mc_token_ids`` are accepted but not used here.
            """
            base_model = self.base_model_class(config)
            base_model.eval()

            # The first two calls only have to run without raising.
            base_model(input_ids, position_ids, token_type_ids)
            base_model(input_ids, position_ids)
            last_outputs = base_model(input_ids)

            expected_shape = [self.batch_size, self.n_choices, self.seq_length, self.hidden_size]
            self.parent.assertListEqual(list(last_outputs[0].size()), expected_shape)
        def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
                                        mc_labels, lm_labels, mc_token_ids):
            """Run the LM-head model with labels and check the logits and loss shapes.

            The first two outputs are read as ``(loss, lm_logits)``. The logits must
            have one entry per vocabulary item for every input position, and the loss
            must have an empty size.
            """
            lm_model = self.lm_head_model_class(config)
            lm_model.eval()

            results = lm_model(input_ids, position_ids, token_type_ids, lm_labels)
            loss, lm_logits = results[:2]

            expected_logits_shape = [self.batch_size, self.n_choices, self.seq_length, self.vocab_size]
            self.parent.assertListEqual(list(lm_logits.size()), expected_logits_shape)
            self.parent.assertListEqual(list(loss.size()), [])
        def create_and_check_presents(self, config, input_ids, token_type_ids, position_ids,
                                        mc_labels, lm_labels, mc_token_ids):
            """Check the last output ("presents") of every model class.

            Each model is called with ``input_ids`` only. Its last output must hold
            one entry per hidden layer, and the first entry must have size
            ``[2, batch * choices, heads, seq_length, hidden_size // heads]``.
            """
            for model_class in self.all_model_classes:
                current_model = model_class(config)
                current_model.eval()

                presents = current_model(input_ids)[-1]
                self.parent.assertEqual(self.num_hidden_layers, len(presents))

                head_dim = self.hidden_size // self.num_attention_heads
                expected_shape = [2, self.batch_size * self.n_choices, self.num_attention_heads,
                                  self.seq_length, head_dim]
                self.parent.assertListEqual(list(presents[0].size()), expected_shape)
        def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
                                        mc_labels, lm_labels, mc_token_ids):
            """Run the double-heads model with both label sets and check output shapes.

            The first four outputs are read as
            ``(lm_loss, mc_loss, lm_logits, mc_logits)``. Both losses must have an
            empty size; the LM logits cover every position and vocabulary item, and
            the multiple-choice logits have one value per choice.
            """
            double_model = self.double_head_model_class(config)
            double_model.eval()

            results = double_model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
                                   token_type_ids=token_type_ids, position_ids=position_ids)
            lm_loss, mc_loss, lm_logits, mc_logits = results[:4]

            expected_lm_shape = [self.batch_size, self.n_choices, self.seq_length, self.vocab_size]
            self.parent.assertListEqual(list(lm_logits.size()), expected_lm_shape)
            self.parent.assertListEqual(list(mc_logits.size()), [self.batch_size, self.n_choices])

            loss_shapes = [list(lm_loss.size()), list(mc_loss.size())]
            self.parent.assertListEqual(loss_shapes, [[], []])
        def create_and_check_model_from_pretrained(self):
            """Load the first pretrained base model and check that it is not ``None``.

            Only the first key of ``pretrained_model_archive_map`` is used. The
            cache directory passed to ``from_pretrained`` is removed afterwards,
            whether or not loading succeeded.
            """
            cache_dir = "/tmp/pytorch_transformers_test/"
            for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
                try:
                    model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
                finally:
                    # Clean up even when loading raises; ignore_errors keeps a
                    # missing directory from masking the original exception.
                    shutil.rmtree(cache_dir, ignore_errors=True)
                self.parent.assertIsNotNone(model)
        def prepare_config_and_inputs_for_common(self):
            """Return ``(config, inputs_dict)`` for the shared model tests.

            Takes the 7-tuple produced by ``prepare_config_and_inputs`` and keeps
            only the config and ``input_ids``; the latter is wrapped in a dict
            keyed by ``'input_ids'``.
            """
            (config, input_ids, _token_type_ids, _position_ids,
                _mc_labels, _lm_labels, _mc_token_ids) = self.prepare_config_and_inputs()
            return config, {'input_ids': input_ids}
        def run_common_tests(self, test_presents=False):
            """Run the base-model, LM-head and double-heads checks in that order.

            Each check receives a freshly prepared config and input tuple. The
            "presents" check is appended only when ``test_presents`` is true.
            """
            checks = [self.create_and_check_base_model,
                      self.create_and_check_lm_head,
                      self.create_and_check_double_heads]
            if test_presents:
                checks.append(self.create_and_check_presents)

            for check in checks:
                check(*self.prepare_config_and_inputs())
        def run_slow_tests(self):
            """Run the slow checks; currently only the pretrained-model loading check."""
            self.create_and_check_model_from_pretrained()
 class ConfigTester(object):
@@ -275,179 +539,22 @@ class ConfigTester(object):
        self.create_and_test_config_to_json_file()
 class GPTModelTester(object):
    def __init__(self,
                    parent,
                    batch_size=13,
                    seq_length=7,
                    is_training=True,
                    use_position_ids=True,
                    use_token_type_ids=True,
                    use_labels=True,
                    vocab_size=99,
                    n_positions=33,
                    hidden_size=32,
                    num_hidden_layers=5,
                    num_attention_heads=4,
                    n_choices=3,
                    type_sequence_label_size=2,
                    initializer_range=0.02,
                    num_labels=3,
                    scope=None,
                    config_class=None,
                    base_model_class=None,
                    lm_head_model_class=None,
                    double_head_model_class=None,
                    ):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_position_ids = use_position_ids
        self.use_token_type_ids = use_token_type_ids
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_choices = n_choices
        self.type_sequence_label_size = type_sequence_label_size
        self.initializer_range = initializer_range
        self.num_labels = num_labels
        self.scope = scope
        self.config_class = config_class
        self.base_model_class = base_model_class
        self.lm_head_model_class = lm_head_model_class
        self.double_head_model_class = double_head_model_class
        self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
    def prepare_config_and_inputs(self):
        """Build a config plus random inputs and labels for the GPT-style models.

        Returns the tuple ``(config, input_ids, token_type_ids, position_ids,
        mc_labels, lm_labels, mc_token_ids)``. Optional entries are ``None`` when
        the matching ``use_*`` flag is off.
        """
        token_shape = [self.batch_size, self.n_choices, self.seq_length]

        input_ids = ids_tensor(token_shape, self.vocab_size)
        position_ids = ids_tensor(token_shape, self.n_positions) if self.use_position_ids else None
        # Token type ids are drawn from the same range as the vocabulary.
        token_type_ids = ids_tensor(token_shape, self.vocab_size) if self.use_token_type_ids else None

        if self.use_labels:
            mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            lm_labels = ids_tensor(token_shape, self.num_labels)
            mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
        else:
            mc_labels = lm_labels = mc_token_ids = None

        config = self.config_class(
            vocab_size_or_config_json_file=self.vocab_size,
            n_positions=self.n_positions,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
            initializer_range=self.initializer_range)

        return (config, input_ids, token_type_ids, position_ids,
                mc_labels, lm_labels, mc_token_ids)
    def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
                            mc_labels, lm_labels, mc_token_ids):
        """Run the base model in eval mode and check the shape of its first output.

        The model is called with three, two and one positional inputs; only the
        result of the last call (``input_ids`` alone) is inspected. ``mc_labels``,
        ``lm_labels`` and ``mc_token_ids`` are accepted but not used here.
        """
        base_model = self.base_model_class(config)
        base_model.eval()

        # The first two calls only have to run without raising.
        base_model(input_ids, position_ids, token_type_ids)
        base_model(input_ids, position_ids)
        last_outputs = base_model(input_ids)

        expected_shape = [self.batch_size, self.n_choices, self.seq_length, self.hidden_size]
        self.parent.assertListEqual(list(last_outputs[0].size()), expected_shape)
-    def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
+def ids_tensor(shape, vocab_size, rng=None, name=None):
-                                    mc_labels, lm_labels, mc_token_ids):
+    """Creates a random int32 tensor of the shape within the vocab size."""
-        model = self.lm_head_model_class(config)
+    if rng is None:
-        model.eval()
+        rng = random.Random()
        outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
        loss, lm_logits = outputs[:2]
-        total_voc = self.vocab_size
+    total_dims = 1
-        self.parent.assertListEqual(
+    for dim in shape:
-            list(lm_logits.size()),
+        total_dims *= dim
            [self.batch_size, self.n_choices, self.seq_length, total_voc])
        self.parent.assertListEqual(
            list(loss.size()),
            [])
-    def create_and_check_presents(self, config, input_ids, token_type_ids, position_ids,
+    values = []
-                                    mc_labels, lm_labels, mc_token_ids):
+    for _ in range(total_dims):
-        for model_class in self.all_model_classes:
+        values.append(rng.randint(0, vocab_size - 1))
            model = model_class(config)
            model.eval()
            outputs = model(input_ids)
            presents = outputs[-1]
            self.parent.assertEqual(self.num_hidden_layers, len(presents))
            self.parent.assertListEqual(
                list(presents[0].size()),
                [2, self.batch_size * self.n_choices, self.num_attention_heads,
                    self.seq_length, self.hidden_size // self.num_attention_heads])
-    def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
+    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
                                    mc_labels, lm_labels, mc_token_ids):
        model = self.double_head_model_class(config)
        model.eval()
        outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
                        token_type_ids=token_type_ids, position_ids=position_ids)
        lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
        loss = [lm_loss, mc_loss]
        total_voc = self.vocab_size
        self.parent.assertListEqual(
            list(lm_logits.size()),
            [self.batch_size, self.n_choices, self.seq_length, total_voc])
        self.parent.assertListEqual(
            list(mc_logits.size()),
            [self.batch_size, self.n_choices])
        self.parent.assertListEqual(
            [list(l.size()) for l in loss],
            [[], []])
    def create_and_check_model_from_pretrained(self):
        """Load the first pretrained base model and check that it is not ``None``.

        Only the first key of ``pretrained_model_archive_map`` is used. The
        cache directory passed to ``from_pretrained`` is removed afterwards,
        whether or not loading succeeded.
        """
        cache_dir = "/tmp/pytorch_transformers_test/"
        for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
            try:
                model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
            finally:
                # Clean up even when loading raises; ignore_errors keeps a
                # missing directory from masking the original exception.
                shutil.rmtree(cache_dir, ignore_errors=True)
            self.parent.assertIsNotNone(model)
    def create_and_check_commons(self, config, input_ids, token_type_ids, position_ids,
                                    mc_labels, lm_labels, mc_token_ids):
        """Run the shared common checks with ``input_ids`` as the only model input."""
        # Inside the method body this name resolves to the module-level
        # function of the same name, not to this method.
        create_and_check_commons(self, config, {'input_ids': input_ids})
    def run_common_tests(self, test_presents=False):
        """Run the base-model, LM-head, double-heads and common checks in order.

        Each check receives a freshly prepared config and input tuple. The
        "presents" check runs before the common checks, and only when
        ``test_presents`` is true.
        """
        checks = [self.create_and_check_base_model,
                  self.create_and_check_lm_head,
                  self.create_and_check_double_heads]
        if test_presents:
            checks.append(self.create_and_check_presents)
        checks.append(self.create_and_check_commons)

        for check in checks:
            check(*self.prepare_config_and_inputs())
    def run_slow_tests(self):
        """Run the slow checks; currently only the pretrained-model loading check."""
        self.create_and_check_model_from_pretrained()
 class ModelUtilsTest(unittest.TestCase):
@@ -471,79 +578,6 @@ class ModelUtilsTest(unittest.TestCase):
            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(model.config, config)
    def test_resize_tokens_embeddings(self):
        """Grow then restore the BERT token embeddings and check sizes and values.

        Uses the first entry of ``BERT_PRETRAINED_MODEL_ARCHIVE_MAP``. After
        resizing by +10 and back to the original size, the config's vocabulary
        size and the embedding matrix row count must follow, and the rows must
        match the snapshot taken before any resize.
        """
        logging.basicConfig(level=logging.INFO)

        first_model_names = list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]
        for model_name in first_model_names:
            config = BertConfig.from_pretrained(model_name)
            model = BertModel.from_pretrained(model_name)
            original_vocab_size = config.vocab_size

            # Snapshot the embedding matrix before any resize.
            original_weights = model.embeddings.word_embeddings.weight.clone()
            original_rows = original_weights.shape[0]

            # Growing the vocabulary must update both the config and the matrix.
            model.resize_token_embeddings(original_vocab_size + 10)
            self.assertEqual(model.config.vocab_size, original_vocab_size + 10)
            self.assertEqual(model.embeddings.word_embeddings.weight.shape[0], original_rows + 10)

            # Shrinking back to the original size must do the same.
            model.resize_token_embeddings(original_vocab_size)
            self.assertEqual(model.config.vocab_size, original_vocab_size)
            self.assertEqual(model.embeddings.word_embeddings.weight.shape[0], original_rows)

            # Adding and removing tokens must leave the original rows untouched.
            rows_differ = any(
                before.data.ne(after.data).sum() > 0
                for before, after in zip(original_weights, model.embeddings.word_embeddings.weight))
            self.assertTrue(not rows_differ)
    def test_tie_model_weights(self):
        """Check that GPT-2 input embeddings and LM head stay tied.

        Uses the first entry of ``GPT2_PRETRAINED_MODEL_ARCHIVE_MAP``. The two
        layers must have equal weight shapes and values initially, after an
        in-place change to either one, and after the token embeddings have
        been resized.
        """
        logging.basicConfig(level=logging.INFO)

        def check_same_values(layer_1, layer_2):
            """Return True when every row of the two layers' weights is equal."""
            equal = True
            for p1, p2 in zip(layer_1.weight, layer_2.weight):
                if p1.data.ne(p2.data).sum() > 0:
                    equal = False
            return equal

        for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            config = GPT2Config.from_pretrained(model_name)
            model = GPT2LMHeadModel.from_pretrained(model_name)

            # Get the embeddings and decoding layer
            embeddings = model.transformer.wte
            decoding = model.lm_head

            # Check that the embedding layer and decoding layer are the same in size and in value.
            # assertEqual is required here: assertTrue(a, b) would treat b as the failure message.
            self.assertEqual(embeddings.weight.shape, decoding.weight.shape)
            self.assertTrue(check_same_values(embeddings, decoding))

            # Check that after modifying the embeddings in place, they remain the same.
            embeddings.weight.data.div_(2)
            self.assertEqual(embeddings.weight.shape, decoding.weight.shape)
            self.assertTrue(check_same_values(embeddings, decoding))

            # Check that after modifying the decoder in place, they remain the same.
            decoding.weight.data.div_(4)
            self.assertEqual(embeddings.weight.shape, decoding.weight.shape)
            self.assertTrue(check_same_values(embeddings, decoding))

            # Check that after resize they remain tied.
            model.resize_token_embeddings(config.vocab_size + 10)
            decoding.weight.data.mul_(20)
            self.assertEqual(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
            self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
 if __name__ == "__main__":
    unittest.main()
--- a/pytorch_transformers/tests/modeling_gpt2_test.py
+++ b/pytorch_transformers/tests/modeling_gpt2_test.py
@@ -16,19 +16,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import os
 import unittest
 import json
 import random
 import shutil
 import pytest
 import torch
 from pytorch_transformers import (GPT2Config, GPT2Model,
-                                     GPT2LMHeadModel, GPT2DoubleHeadsModel)
+                                  GPT2LMHeadModel, GPT2DoubleHeadsModel)
-from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import CommonTestCases, ConfigTester
 class GPT2ModelTest(unittest.TestCase):
@@ -37,14 +32,14 @@ class GPT2ModelTest(unittest.TestCase):
        config_tester.run_common_tests()
    def test_model(self):
-        model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
                                            lm_head_model_class=GPT2LMHeadModel,
                                            double_head_model_class=GPT2DoubleHeadsModel)
        model_tester.run_common_tests(test_presents=True)
    @pytest.mark.slow
    def test_pretrained(self):
-        model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
                                            lm_head_model_class=GPT2LMHeadModel,
                                            double_head_model_class=GPT2DoubleHeadsModel)
        model_tester.run_slow_tests()
--- a/pytorch_transformers/tests/modeling_openai_test.py
+++ b/pytorch_transformers/tests/modeling_openai_test.py
@@ -19,12 +19,11 @@ from __future__ import print_function
 import unittest
 import pytest
 import torch
 from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
                                     OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
-from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import CommonTestCases, ConfigTester
 class OpenAIModelTest(unittest.TestCase):
@@ -33,14 +32,14 @@ class OpenAIModelTest(unittest.TestCase):
        config_tester.run_common_tests()
    def test_model(self):
-        model_tester = GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
                                           lm_head_model_class=OpenAIGPTLMHeadModel,
                                           double_head_model_class=OpenAIGPTDoubleHeadsModel)
        model_tester.run_common_tests(test_presents=False)
    @pytest.mark.slow
    def test_pretrained(self):
-        model_tester = GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
                                           lm_head_model_class=OpenAIGPTLMHeadModel,
                                           double_head_model_class=OpenAIGPTDoubleHeadsModel)
        model_tester.run_slow_tests()
--- a/pytorch_transformers/tests/modeling_transfo_xl_test.py
+++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py
@@ -28,9 +28,15 @@ import torch
 from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
 from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor
 class TransfoXLModelTest(CommonTestCases.CommonModelTester):
    all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel)
    test_pruning = False
    test_torchscript = False
    test_resize_embeddings = False
 class TransfoXLModelTest(unittest.TestCase):
    class TransfoXLModelTester(object):
        def __init__(self,
@@ -52,7 +58,6 @@ class TransfoXLModelTest(unittest.TestCase):
                     num_hidden_layers=5,
                     scope=None,
                     seed=1,
                     all_model_classes=(TransfoXLModel, TransfoXLLMHeadModel),
                     ):
            self.parent = parent
            self.batch_size = batch_size
@@ -73,7 +78,6 @@ class TransfoXLModelTest(unittest.TestCase):
            self.num_hidden_layers = num_hidden_layers
            self.scope = scope
            self.seed = seed
            self.all_model_classes = all_model_classes
        def prepare_config_and_inputs(self):
            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -171,16 +175,31 @@ class TransfoXLModelTest(unittest.TestCase):
                list(list(mem.size()) for mem in result["mems_2"]),
                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
-        def create_and_check_transfo_xl_commons(self, config, input_ids_1, input_ids_2, lm_labels):
+        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids_1}
-            create_and_check_commons(self, config, inputs_dict, test_pruning=False, test_torchscript=False)
+            return config, inputs_dict
-    def test_default(self):
+
-        self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self))
+    def setUp(self):
        self.model_tester = TransfoXLModelTest.TransfoXLModelTester(self)
        self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
    def test_config(self):
-        config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
+        self.config_tester.run_common_tests()
-        config_tester.run_common_tests()
+
    def test_transfo_xl_model(self):
        # Exercise the base model through the tester built in setUp: call
        # set_seed(), prepare config and inputs, create the model output and
        # pass it to the matching output checker.
        tester = self.model_tester
        tester.set_seed()
        prepared = tester.prepare_config_and_inputs()
        model_output = tester.create_transfo_xl_model(*prepared)
        tester.check_transfo_xl_model_output(model_output)
    def test_transfo_xl_lm_head(self):
        # Same flow as the base-model test, but for the LM head: call
        # set_seed(), prepare config and inputs, create the LM-head output and
        # pass it to the LM-head output checker.
        tester = self.model_tester
        tester.set_seed()
        prepared = tester.prepare_config_and_inputs()
        lm_head_output = tester.create_transfo_xl_lm_head(*prepared)
        tester.check_transfo_xl_lm_head_output(lm_head_output)
    @pytest.mark.slow
    def test_model_from_pretrained(self):
@@ -190,23 +209,6 @@ class TransfoXLModelTest(unittest.TestCase):
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
    def run_tester(self, tester):
        """Run the Transfo-XL model, LM-head and common checks on ``tester``.

        ``tester`` must provide ``set_seed``, ``prepare_config_and_inputs`` and
        the ``create_*`` / ``check_*`` methods named below. Each stage calls
        ``set_seed()`` before preparing fresh config and inputs.
        """
        # One preparation happens before any set_seed() call; its result is discarded.
        tester.prepare_config_and_inputs()

        # (method that creates the output, method that checks it)
        create_check_pairs = (
            ("create_transfo_xl_model", "check_transfo_xl_model_output"),
            ("create_transfo_xl_lm_head", "check_transfo_xl_lm_head_output"),
        )
        for create_name, check_name in create_check_pairs:
            tester.set_seed()
            prepared = tester.prepare_config_and_inputs()
            produced = getattr(tester, create_name)(*prepared)
            getattr(tester, check_name)(produced)

        # The common checks take the prepared tuple directly and return nothing to check.
        tester.set_seed()
        prepared = tester.prepare_config_and_inputs()
        tester.create_and_check_transfo_xl_commons(*prepared)
 if __name__ == "__main__":
    unittest.main()
--- a/pytorch_transformers/tests/modeling_xlm_test.py
+++ b/pytorch_transformers/tests/modeling_xlm_test.py
@@ -23,10 +23,15 @@ import pytest
 from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
 from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
-class XLMModelTest(unittest.TestCase):
+class XLMModelTest(CommonTestCases.CommonModelTester):
    all_model_classes = (XLMModel, XLMWithLMHeadModel,  
                         XLMForQuestionAnswering, XLMForSequenceClassification) 
                         # , XLMForSequenceClassification, XLMForTokenClassification),
    class XLMModelTester(object):
        def __init__(self,
@@ -58,8 +63,6 @@ class XLMModelTest(unittest.TestCase):
                     summary_type="last",
                     use_proj=True,
                     scope=None,
                     all_model_classes = (XLMModel, XLMWithLMHeadModel,
                                          XLMForQuestionAnswering, XLMForSequenceClassification),  # , XLMForSequenceClassification, XLMForTokenClassification),
                    ):
            self.parent = parent
            self.batch_size = batch_size
@@ -90,7 +93,6 @@ class XLMModelTest(unittest.TestCase):
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
            self.all_model_classes = all_model_classes
        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -237,28 +239,23 @@ class XLMModelTest(unittest.TestCase):
                [self.batch_size, self.type_sequence_label_size])
-        def create_and_check_xlm_commons(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, input_lengths,
             sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths}
-            create_and_check_commons(self, config, inputs_dict)
+            return config, inputs_dict
-    def test_default(self):
+    def setUp(self):
-        self.run_tester(XLMModelTest.XLMModelTester(self))
+        self.model_tester = XLMModelTest.XLMModelTester(self)
        self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
    def test_config(self):
-        config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
+        self.config_tester.run_common_tests()
        config_tester.run_common_tests()
-    @pytest.mark.slow
+    def test_xlm_model(self):
-    def test_model_from_pretrained(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        self.model_tester.create_and_check_xlm_model(*config_and_inputs)
        for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
    def run_tester(self, tester):
        config_and_inputs = tester.prepare_config_and_inputs()
        tester.create_and_check_xlm_model(*config_and_inputs)
        # config_and_inputs = tester.prepare_config_and_inputs()
        # tester.create_and_check_xlm_for_masked_lm(*config_and_inputs)
@@ -275,8 +272,14 @@ class XLMModelTest(unittest.TestCase):
        # config_and_inputs = tester.prepare_config_and_inputs()
        # tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
-        config_and_inputs = tester.prepare_config_and_inputs()
+    @pytest.mark.slow
-        tester.create_and_check_xlm_commons(*config_and_inputs)
+    def test_model_from_pretrained(self):
        cache_dir = "/tmp/pytorch_transformers_test/"
        for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
 if __name__ == "__main__":
    unittest.main()
--- a/pytorch_transformers/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -28,9 +28,14 @@ import torch
 from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
 from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor
 class XLNetModelTest(CommonTestCases.CommonModelTester):
    all_model_classes=(XLNetModel, XLNetLMHeadModel,
                    XLNetForSequenceClassification, XLNetForQuestionAnswering)
    test_pruning = False
 class XLNetModelTest(unittest.TestCase):
    class XLNetModelTester(object):
        def __init__(self,
@@ -56,8 +61,6 @@ class XLNetModelTest(unittest.TestCase):
                     initializer_range=0.05,
                     seed=1,
                     type_vocab_size=2,
                     all_model_classes=(XLNetModel, XLNetLMHeadModel,
                                        XLNetForSequenceClassification, XLNetForQuestionAnswering),
            ):
            self.parent = parent
            self.batch_size = batch_size
@@ -82,7 +85,6 @@ class XLNetModelTest(unittest.TestCase):
            self.seed = seed
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.all_model_classes = all_model_classes
        def prepare_config_and_inputs(self):
            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -264,17 +266,41 @@ class XLNetModelTest(unittest.TestCase):
                list(list(mem.size()) for mem in result["mems_1"]),
                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
-        def create_and_check_xlnet_commons(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
+        def prepare_config_and_inputs_for_common(self):
-                target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                target_mapping, inp_q, segment_ids, lm_labels,
                sequence_labels, is_impossible_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids_1}
-            create_and_check_commons(self, config, inputs_dict, test_pruning=False)
+            return config, inputs_dict
-    def test_default(self):
+
-        self.run_tester(XLNetModelTest.XLNetModelTester(self))
+    def setUp(self):
        self.model_tester = XLNetModelTest.XLNetModelTester(self)
        self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
    def test_config(self):
-        config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
+        self.config_tester.run_common_tests()
-        config_tester.run_common_tests()
+
    def test_xlnet_base_model(self):
        # Call set_seed() on the tester built in setUp, prepare config and
        # inputs, then run the base-model create-and-check step on them.
        tester = self.model_tester
        tester.set_seed()
        prepared = tester.prepare_config_and_inputs()
        tester.create_and_check_xlnet_base_model(*prepared)
    def test_xlnet_lm_head(self):
        # Call set_seed() on the tester built in setUp, prepare config and
        # inputs, then run the LM-head create-and-check step on them.
        tester = self.model_tester
        tester.set_seed()
        prepared = tester.prepare_config_and_inputs()
        tester.create_and_check_xlnet_lm_head(*prepared)
    def test_xlnet_sequence_classif(self):
        # Call set_seed() on the tester built in setUp, prepare config and
        # inputs, then run the sequence-classification create-and-check step.
        tester = self.model_tester
        tester.set_seed()
        prepared = tester.prepare_config_and_inputs()
        tester.create_and_check_xlnet_sequence_classif(*prepared)
    def test_xlnet_qa(self):
        # Call set_seed() on the tester built in setUp, prepare config and
        # inputs, then run the question-answering create-and-check step.
        tester = self.model_tester
        tester.set_seed()
        prepared = tester.prepare_config_and_inputs()
        tester.create_and_check_xlnet_qa(*prepared)
    @pytest.mark.slow
    def test_model_from_pretrained(self):
@@ -284,27 +310,6 @@ class XLNetModelTest(unittest.TestCase):
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
    def run_tester(self, tester):
        """Run every XLNet create-and-check step on ``tester``, one after another.

        ``tester`` must provide ``set_seed``, ``prepare_config_and_inputs`` and
        each ``create_and_check_xlnet_*`` method named below. Before every step
        ``set_seed()`` is called and config and inputs are prepared afresh.
        """
        # Executed in this order; the common checks come last.
        step_names = (
            "create_and_check_xlnet_base_model",
            "create_and_check_xlnet_lm_head",
            "create_and_check_xlnet_sequence_classif",
            "create_and_check_xlnet_qa",
            "create_and_check_xlnet_commons",
        )
        for step_name in step_names:
            tester.set_seed()
            prepared = tester.prepare_config_and_inputs()
            getattr(tester, step_name)(*prepared)
 if __name__ == "__main__":
    unittest.main()