From 2918b7d2a09d7253b338c004258866da41cd6642 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 12 Jul 2019 10:57:58 +0200 Subject: [PATCH] updating tests --- pytorch_transformers/modeling_bert.py | 15 +- pytorch_transformers/modeling_gpt2.py | 15 +- pytorch_transformers/modeling_openai.py | 15 +- pytorch_transformers/modeling_transfo_xl.py | 19 +- pytorch_transformers/modeling_utils.py | 47 +- pytorch_transformers/modeling_xlm.py | 10 +- pytorch_transformers/modeling_xlnet.py | 13 +- .../tests/modeling_bert_test.py | 88 +- .../tests/modeling_common_test.py | 848 +++++++++--------- .../tests/modeling_gpt2_test.py | 13 +- .../tests/modeling_openai_test.py | 7 +- .../tests/modeling_transfo_xl_test.py | 56 +- .../tests/modeling_xlm_test.py | 51 +- .../tests/modeling_xlnet_test.py | 71 +- 14 files changed, 672 insertions(+), 596 deletions(-) diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py index d88c57bb79..8c75925a07 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/pytorch_transformers/modeling_bert.py @@ -617,6 +617,7 @@ class BertModel(BertPreTrainedModel): old_embeddings = self.embeddings.word_embeddings new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) self.embeddings.word_embeddings = new_embeddings + return self.embeddings.word_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. @@ -758,11 +759,8 @@ class BertForPreTraining(BertPreTrainedModel): """ Make sure we are sharing the input and output embeddings. Export to TorchScript can't handle parameter sharing so we are cloning them instead. """ - input_embeddings = self.bert.embeddings.word_embeddings.weight - if self.config.torchscript: - self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone()) - else: - self.cls.predictions.decoder.weight = input_embeddings # Tied weights + self._tie_or_clone_weights(self.cls.predictions.decoder, + self.bert.embeddings.word_embeddings) def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None, head_mask=None): @@ -864,11 +862,8 @@ class BertForMaskedLM(BertPreTrainedModel): """ Make sure we are sharing the input and output embeddings. Export to TorchScript can't handle parameter sharing so we are cloning them instead. """ - input_embeddings = self.bert.embeddings.word_embeddings.weight - if self.config.torchscript: - self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone()) - else: - self.cls.predictions.decoder.weight = input_embeddings # Tied weights + self._tie_or_clone_weights(self.cls.predictions.decoder, + self.bert.embeddings.word_embeddings) def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None): """ diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 06f933147f..b5fc6fc49b 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -414,6 +414,7 @@ class GPT2Model(GPT2PreTrainedModel): def _resize_token_embeddings(self, new_num_tokens): self.wte = self._get_resized_embeddings(self.wte, new_num_tokens) + return self.wte def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. @@ -562,11 +563,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): """ Make sure we are sharing the input and output embeddings. Export to TorchScript can't handle parameter sharing so we are cloning them instead. """ - input_embeddings = self.transformer.wte.weight - if self.config.torchscript: - self.lm_head.weight = nn.Parameter(input_embeddings.clone()) - else: - self.lm_head.weight = input_embeddings # Tied weights + self._tie_or_clone_weights(self.lm_head, + self.transformer.wte) def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None): """ @@ -658,11 +656,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): """ Make sure we are sharing the input and output embeddings. Export to TorchScript can't handle parameter sharing so we are cloning them instead. """ - input_embeddings = self.transformer.wte.weight - if self.config.torchscript: - self.lm_head.weight = nn.Parameter(input_embeddings.clone()) - else: - self.lm_head.weight = input_embeddings # Tied weights + self._tie_or_clone_weights(self.lm_head, + self.transformer.wte) def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, past=None, head_mask=None): diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index ebf1035d21..9fb4720e93 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -430,6 +430,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): def _resize_token_embeddings(self, new_num_tokens): self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens) + return self.tokens_embed def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. @@ -583,11 +584,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): """ Make sure we are sharing the input and output embeddings. Export to TorchScript can't handle parameter sharing so we are cloning them instead. """ - input_embeddings = self.transformer.tokens_embed.weight - if self.config.torchscript: - self.lm_head.weight = nn.Parameter(input_embeddings.clone()) - else: - self.lm_head.weight = input_embeddings # Tied weights + self._tie_or_clone_weights(self.lm_head, + self.transformer.tokens_embed) def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None): """ @@ -696,11 +694,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): """ Make sure we are sharing the input and output embeddings. Export to TorchScript can't handle parameter sharing so we are cloning them instead. """ - input_embeddings = self.transformer.tokens_embed.weight - if self.config.torchscript: - self.lm_head.weight = nn.Parameter(input_embeddings.clone()) - else: - self.lm_head.weight = input_embeddings # Tied weights + self._tie_or_clone_weights(self.lm_head, + self.transformer.tokens_embed) def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, head_mask=None): diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py index 71f80a9eea..b31723168a 100644 --- a/pytorch_transformers/modeling_transfo_xl.py +++ b/pytorch_transformers/modeling_transfo_xl.py @@ -291,6 +291,10 @@ class TransfoXLConfig(PretrainedConfig): def vocab_size(self): return self.n_token + @vocab_size.setter + def vocab_size(self, value): + self.n_token = value + @property def hidden_size(self): return self.d_model @@ -1003,7 +1007,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): self.apply(self.init_weights) def _resize_token_embeddings(self, new_num_tokens): - raise NotImplementedError + return self.word_emb def backward_compatible(self): self.sample_softmax = -1 @@ -1280,13 +1284,20 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): else: if self.config.tie_weight: for i in range(len(self.crit.out_layers)): - self.crit.out_layers[i].weight = self.transformer.word_emb.emb_layers[i].weight + self._tie_or_clone_weights(self.crit.out_layers[i], + self.transformer.word_emb.emb_layers[i]) if self.config.tie_projs: for i, tie_proj in enumerate(self.config.tie_projs): if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed: - self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0] + if self.config.torchscript: + self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone()) + else: + self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0] elif tie_proj and self.config.div_val != 1: - self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i] + if self.config.torchscript: + self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone()) + else: + self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i] def reset_length(self, tgt_len, ext_len, mem_len): self.transformer.reset_length(tgt_len, ext_len, mem_len) diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 8fdfda4720..9ca3a3d090 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -165,9 +165,27 @@ class PreTrainedModel(nn.Module): # Save config in model self.config = config - def _get_resized_embeddings(self, old_embeddings, new_num_tokens): - # Build new embeddings + def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): + """ Build a resized Embedding Module from a provided token Embedding Module. + Increasing the size will add newly initialized vectors at the end + Reducing the size will remove vectors from the end + + Args: + new_num_tokens: (Optional) New number of tokens in the embedding matrix. + Increasing the size will add newly initialized vectors at the end + Reducing the size will remove vectors from the end + If not provided or None: return the provided token Embedding Module. + Return: + Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None + """ + if new_num_tokens is None: + return old_embeddings + old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + if old_num_tokens == new_num_tokens: + return old_embeddings + + # Build new embeddings new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) new_embeddings.to(old_embeddings.weight.device) @@ -180,18 +198,29 @@ class PreTrainedModel(nn.Module): return new_embeddings - def resize_token_embeddings(self, new_num_tokens): - """ Resize input token embeddings matrix. + def _tie_or_clone_weights(self, first_module, second_module): + """ Tie or clone module weights depending of weither we are using TorchScript or not + """ + if self.config.torchscript: + first_module.weight = nn.Parameter(second_module.weight.clone()) + else: + first_module.weight = second_module.weight + + def resize_token_embeddings(self, new_num_tokens=None): + """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. Args: - new_num_tokens: New number of tokens in the embedding matrix. + new_num_tokens: (Optional) New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end Reducing the size will remove vectors from the end + If not provided or None: does nothing. + Return: + Pointer to the input tokens Embedding Module of the model """ - if new_num_tokens == self.config.vocab_size: - return base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed - base_model._resize_token_embeddings(new_num_tokens) + model_embeds = base_model._resize_token_embeddings(new_num_tokens) + if new_num_tokens is None: + return model_embeds # Update base model and current model config self.config.vocab_size = new_num_tokens @@ -201,6 +230,8 @@ class PreTrainedModel(nn.Module): if hasattr(self, 'tie_weights'): self.tie_weights() + return model_embeds + def prune_heads(self, heads_to_prune): """ Prunes heads of the base model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index 3d5b35fae6..755e504b7d 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -184,6 +184,10 @@ class XLMConfig(PretrainedConfig): def vocab_size(self): return self.n_words + @vocab_size.setter + def vocab_size(self, value): + self.n_words = value + @property def hidden_size(self): return self.emb_dim @@ -479,6 +483,7 @@ class XLMModel(XLMPreTrainedModel): def _resize_token_embeddings(self, new_num_tokens): self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens) + return self.embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. @@ -728,10 +733,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): def tie_weights(self): """ Make sure we are sharing the embeddings """ - if self.config.torchscript: - self.pred_layer.proj.weight = nn.Parameter(self.transformer.embeddings.weight.clone()) - else: - self.pred_layer.proj.weight = self.transformer.embeddings.weight + self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings) def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None, attention_mask=None, cache=None, labels=None, head_mask=None): diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py index 36c068e3a3..051cc4e112 100644 --- a/pytorch_transformers/modeling_xlnet.py +++ b/pytorch_transformers/modeling_xlnet.py @@ -316,6 +316,10 @@ class XLNetConfig(PretrainedConfig): def vocab_size(self): return self.n_token + @vocab_size.setter + def vocab_size(self, value): + self.n_token = value + @property def hidden_size(self): return self.d_model @@ -660,10 +664,10 @@ class XLNetModel(XLNetPreTrainedModel): def _resize_token_embeddings(self, new_num_tokens): self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens) + return self.word_embedding def _prune_heads(self, heads_to_prune): - logger.info("Head pruning is not implemented for XLNet") - pass + raise NotImplementedError def create_mask(self, qlen, mlen): """ @@ -987,10 +991,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): def tie_weights(self): """ Make sure we are sharing the embeddings """ - if self.config.torchscript: - self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone()) - else: - self.lm_loss.weight = self.transformer.word_embedding.weight + self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding) def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, inp_q=None, diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py index 4ab0c9d157..ac5d2636a9 100644 --- a/pytorch_transformers/tests/modeling_bert_test.py +++ b/pytorch_transformers/tests/modeling_bert_test.py @@ -26,10 +26,15 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM, BertForTokenClassification, BertForMultipleChoice) from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor) +from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) -class BertModelTest(unittest.TestCase): +class BertModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction, + BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, + BertForTokenClassification) + class BertModelTester(object): def __init__(self, @@ -55,9 +60,6 @@ class BertModelTest(unittest.TestCase): num_labels=3, num_choices=4, scope=None, - all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction, - BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, - BertForTokenClassification), ): self.parent = parent self.batch_size = batch_size @@ -81,7 +83,6 @@ class BertModelTest(unittest.TestCase): self.num_labels = num_labels self.num_choices = num_choices self.scope = scope - self.all_model_classes = all_model_classes def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) @@ -253,16 +254,51 @@ class BertModelTest(unittest.TestCase): self.check_loss_output(result) - def create_and_check_bert_commons(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, token_type_ids, input_mask, + sequence_labels, token_labels, choice_labels) = config_and_inputs inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} - create_and_check_commons(self, config, inputs_dict) + return config, inputs_dict - def test_default(self): - self.run_tester(BertModelTest.BertModelTester(self)) + def setUp(self): + self.model_tester = BertModelTest.BertModelTester(self) + self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37) def test_config(self): - config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37) - config_tester.run_common_tests() + self.config_tester.run_common_tests() + + def test_bert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs) + + def test_for_next_sequence_prediction(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_pretraining(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs) @pytest.mark.slow def test_model_from_pretrained(self): @@ -272,33 +308,5 @@ class BertModelTest(unittest.TestCase): shutil.rmtree(cache_dir) self.assertIsNotNone(model) - def run_tester(self, tester): - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_bert_model(*config_and_inputs) - - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_bert_for_masked_lm(*config_and_inputs) - - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_bert_for_multiple_choice(*config_and_inputs) - - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs) - - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_bert_for_pretraining(*config_and_inputs) - - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_bert_for_question_answering(*config_and_inputs) - - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_bert_for_sequence_classification(*config_and_inputs) - - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_bert_for_token_classification(*config_and_inputs) - - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_bert_commons(*config_and_inputs) - if __name__ == "__main__": unittest.main() diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py index 557d8aede0..5ea98d68e2 100644 --- a/pytorch_transformers/tests/modeling_common_test.py +++ b/pytorch_transformers/tests/modeling_common_test.py @@ -39,207 +39,471 @@ def _config_zero_init(config): setattr(configs_no_init, key, 0.0) return configs_no_init -def _create_and_check_torchscript_output_attentions(tester, model_classes, config, inputs_dict): - config.output_attentions = True - _create_and_check_torchscript(tester, model_classes, config, inputs_dict) +class CommonTestCases: -def _create_and_check_torchscript_output_hidden_state(tester, model_classes, config, inputs_dict): - config.output_hidden_states = True - _create_and_check_torchscript(tester, model_classes, config, inputs_dict) + class CommonModelTester(unittest.TestCase): -def _create_and_check_torchscript(tester, model_classes, config, inputs_dict): - configs_no_init = _config_zero_init(config) # To be sure we have no Nan - configs_no_init.torchscript = True - for model_class in model_classes: - model = model_class(config=configs_no_init) - model.eval() - inputs = inputs_dict['input_ids'] # Let's keep only input_ids + model_tester = None + all_model_classes = () + test_torchscript = True + test_pruning = True + test_resize_embeddings = True - try: - torch.jit.trace(model, inputs) - except RuntimeError: - tester.parent.fail("Couldn't trace module.") + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - try: - traced_gpt2 = torch.jit.trace(model, inputs) - torch.jit.save(traced_gpt2, "traced_model.pt") - except RuntimeError: - tester.parent.fail("Couldn't save module.") + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + self.assertIn(param.data.mean().item(), [0.0, 1.0], + msg="Parameter {} of model {} seems not properly initialized".format(name, model_class)) - try: - loaded_model = torch.jit.load("traced_model.pt") - os.remove("traced_model.pt") - except ValueError: - tester.parent.fail("Couldn't load module.") + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model.eval() - loaded_model.eval() + for model_class in self.all_model_classes: + config.output_attentions = True + config.output_hidden_states = False + model = model_class(config) + model.eval() + outputs = model(**inputs_dict) + attentions = outputs[-1] + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, False) + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, + self.model_tester.seq_length, + self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + out_len = len(outputs) - model_params = model.parameters() - loaded_model_params = loaded_model.parameters() + # Check attention is always last and order is fine + config.output_attentions = True + config.output_hidden_states = True + model = model_class(config) + model.eval() + outputs = model(**inputs_dict) + self.assertEqual(out_len+1, len(outputs)) + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, True) - models_equal = True - for p1, p2 in zip(model_params, loaded_model_params): - if p1.data.ne(p2.data).sum() > 0: - models_equal = False + attentions = outputs[-1] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, + self.model_tester.seq_length, + self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) - tester.parent.assertTrue(models_equal) + def test_torchscript(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() -def _create_and_check_initialization(tester, model_classes, config, inputs_dict): - configs_no_init = _config_zero_init(config) - for model_class in model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - tester.parent.assertIn(param.data.mean().item(), [0.0, 1.0], - msg="Parameter {} of model {} seems not properly initialized".format(name, model_class)) + self._create_and_check_torchscript(config, inputs_dict) -def _create_and_check_for_headmasking(tester, model_classes, config, inputs_dict): - configs_no_init = _config_zero_init(config) # To be sure we have no Nan - for model_class in model_classes: - config.output_attentions = True - config.output_hidden_states = True - model = model_class(config=configs_no_init) - model.eval() + def test_torchscript_output_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # Prepare head_mask - # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) - head_mask = torch.ones(tester.num_hidden_layers, tester.num_attention_heads) - head_mask[0, 0] = 0 - head_mask[-1, :-1] = 0 - head_mask.requires_grad_(requires_grad=True) - inputs = inputs_dict.copy() - inputs['head_mask'] = head_mask + config.output_attentions = True + self._create_and_check_torchscript(config, inputs_dict) - outputs = model(**inputs) + def test_torchscript_output_hidden_state(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # Test that we can get a gradient back for importance score computation - output = sum(t.sum() for t in outputs[0]) - output = output.sum() - output.backward() - multihead_outputs = head_mask.grad + config.output_hidden_states = True + self._create_and_check_torchscript(config, inputs_dict) - attentions = outputs[-1] - hidden_states = outputs[-2] + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + return - # Remove Nan + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.eval() + inputs = inputs_dict['input_ids'] # Let's keep only input_ids - tester.parent.assertIsNotNone(multihead_outputs) - tester.parent.assertEqual(len(multihead_outputs), tester.num_hidden_layers) - tester.parent.assertAlmostEqual( - attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) - tester.parent.assertNotEqual( - attentions[0][..., -1, :, :].flatten().sum().item(), 0.0) - tester.parent.assertNotEqual( - attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) - tester.parent.assertAlmostEqual( - attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0) - tester.parent.assertNotEqual( - attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) + try: + torch.jit.trace(model, inputs) + except RuntimeError: + self.fail("Couldn't trace module.") + + try: + traced_gpt2 = torch.jit.trace(model, inputs) + torch.jit.save(traced_gpt2, "traced_model.pt") + except RuntimeError: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load("traced_model.pt") + os.remove("traced_model.pt") + except ValueError: + self.fail("Couldn't load module.") + + model.eval() + loaded_model.eval() + + model_params = model.parameters() + loaded_model_params = loaded_model.parameters() + + models_equal = True + for p1, p2 in zip(model_params, loaded_model_params): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) -def _create_and_check_for_head_pruning(tester, model_classes, config, inputs_dict): - for model_class in model_classes: - config.output_attentions = True - config.output_hidden_states = False - model = model_class(config=config) - model.eval() - heads_to_prune = {0: list(range(1, tester.num_attention_heads)), - -1: [0]} - model.prune_heads(heads_to_prune) - outputs = model(**inputs_dict) + def test_headmasking(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - attentions = outputs[-1] + config.output_attentions = True + config.output_hidden_states = True + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.eval() - tester.parent.assertEqual( - attentions[0].shape[-3], 1) - tester.parent.assertEqual( - attentions[1].shape[-3], tester.num_attention_heads) - tester.parent.assertEqual( - attentions[-1].shape[-3], tester.num_attention_heads - 1) + # Prepare head_mask + # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) + head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads) + head_mask[0, 0] = 0 + head_mask[-1, :-1] = 0 + head_mask.requires_grad_(requires_grad=True) + inputs = inputs_dict.copy() + inputs['head_mask'] = head_mask + + outputs = model(**inputs) + + # Test that we can get a gradient back for importance score computation + output = sum(t.sum() for t in outputs[0]) + output = output.sum() + output.backward() + multihead_outputs = head_mask.grad + + attentions = outputs[-1] + hidden_states = outputs[-2] + + # Remove Nan + + self.assertIsNotNone(multihead_outputs) + self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) + self.assertAlmostEqual( + attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual( + attentions[0][..., -1, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual( + attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertAlmostEqual( + attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual( + attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) -def _create_and_check_for_attentions(tester, model_classes, config, inputs_dict): - for model_class in model_classes: - config.output_attentions = True - config.output_hidden_states = False - model = model_class(config) - model.eval() - outputs = model(**inputs_dict) - attentions = outputs[-1] - tester.parent.assertEqual(model.config.output_attentions, True) - tester.parent.assertEqual(model.config.output_hidden_states, False) - tester.parent.assertEqual(len(attentions), tester.num_hidden_layers) - tester.parent.assertListEqual( - list(attentions[0].shape[-3:]), - [tester.num_attention_heads, - tester.seq_length, - tester.key_len if hasattr(tester, 'key_len') else tester.seq_length]) - out_len = len(outputs) + def test_head_pruning(self): + if not self.test_pruning: + return - # Check attention is always last and order is fine - config.output_attentions = True - config.output_hidden_states = True - model = model_class(config) - model.eval() - outputs = model(**inputs_dict) - tester.parent.assertEqual(out_len+1, len(outputs)) - tester.parent.assertEqual(model.config.output_attentions, True) - tester.parent.assertEqual(model.config.output_hidden_states, True) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - attentions = outputs[-1] - tester.parent.assertEqual(len(attentions), tester.num_hidden_layers) - tester.parent.assertListEqual( - list(attentions[0].shape[-3:]), - [tester.num_attention_heads, - tester.seq_length, - tester.key_len if hasattr(tester, 'key_len') else tester.seq_length]) + for model_class in self.all_model_classes: + config.output_attentions = True + config.output_hidden_states = False + model = model_class(config=config) + model.eval() + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), + -1: [0]} + model.prune_heads(heads_to_prune) + outputs = model(**inputs_dict) -def _create_and_check_for_hidden_states(tester, model_classes, config, inputs_dict): - for model_class in model_classes: - config.output_hidden_states = True - config.output_attentions = False - model = model_class(config) - model.eval() - outputs = model(**inputs_dict) - hidden_states = outputs[-1] - tester.parent.assertEqual(model.config.output_attentions, False) - tester.parent.assertEqual(model.config.output_hidden_states, True) - tester.parent.assertEqual(len(hidden_states), tester.num_hidden_layers + 1) - tester.parent.assertListEqual( - list(hidden_states[0].shape[-2:]), - [tester.seq_length, tester.hidden_size]) + attentions = outputs[-1] + + self.assertEqual( + attentions[0].shape[-3], 1) + self.assertEqual( + attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual( + attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) -def create_and_check_commons(tester, config, inputs_dict, test_pruning=True, test_torchscript=True): - _create_and_check_initialization(tester, tester.all_model_classes, config, inputs_dict) - _create_and_check_for_attentions(tester, tester.all_model_classes, config, inputs_dict) - _create_and_check_for_headmasking(tester, tester.all_model_classes, config, inputs_dict) - _create_and_check_for_hidden_states(tester, tester.all_model_classes, config, inputs_dict) + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if test_torchscript: - _create_and_check_torchscript(tester, tester.all_model_classes, config, inputs_dict) - _create_and_check_torchscript_output_attentions(tester, tester.all_model_classes, config, inputs_dict) - _create_and_check_torchscript_output_hidden_state(tester, tester.all_model_classes, config, inputs_dict) + for model_class in self.all_model_classes: + config.output_hidden_states = True + config.output_attentions = False + model = model_class(config) + model.eval() + outputs = model(**inputs_dict) + hidden_states = outputs[-1] + self.assertEqual(model.config.output_attentions, False) + self.assertEqual(model.config.output_hidden_states, True) + self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.seq_length, self.model_tester.hidden_size]) - if test_pruning: - _create_and_check_for_head_pruning(tester, tester.all_model_classes, config, inputs_dict) + def test_resize_tokens_embeddings(self): + original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + if not self.test_resize_embeddings: + return + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config) + + model_vocab_size = config.vocab_size + # Retrieve the embeddings and clone theme + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.vocab_size, model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) + + # Check that adding and removing tokens has not modified the first part of the embedding matrix. + models_equal = True + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_tie_model_weights(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_same_values(layer_1, layer_2): + equal = True + for p1, p2 in zip(layer_1.weight, layer_2.weight): + if p1.data.ne(p2.data).sum() > 0: + equal = False + return equal + + for model_class in self.all_model_classes: + if not hasattr(model_class, 'tie_weights'): + continue + + config.torchscript = True + model_not_tied = model_class(config) + params_not_tied = list(model_not_tied.parameters()) + + config_tied = copy.deepcopy(config) + config_tied.torchscript = False + model_tied = model_class(config_tied) + params_tied = list(model_tied.parameters()) + + # Check that the embedding layer and decoding layer are the same in size and in value + self.assertGreater(len(params_not_tied), len(params_tied)) + # self.assertTrue(check_same_values(embeddings, decoding)) + + # # Check that after modification, they remain the same. + # embeddings.weight.data.div_(2) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) + # self.assertTrue(check_same_values(embeddings, decoding)) + + # # Check that after modification, they remain the same. + # decoding.weight.data.div_(4) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) + # self.assertTrue(check_same_values(embeddings, decoding)) + + # Check that after resize they remain tied. + model_tied.resize_token_embeddings(config.vocab_size + 10) + params_tied_2 = list(model_tied.parameters()) + self.assertGreater(len(params_not_tied), len(params_tied)) + self.assertEqual(len(params_tied_2), len(params_tied)) + + # decoding.weight.data.mul_(20) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) + # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) -def ids_tensor(shape, vocab_size, rng=None, name=None): - """Creates a random int32 tensor of the shape within the vocab size.""" - if rng is None: - rng = random.Random() + class GPTModelTester(CommonModelTester): - total_dims = 1 - for dim in shape: - total_dims *= dim + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_position_ids=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + n_positions=33, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + n_choices=3, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + scope=None, + config_class=None, + base_model_class=None, + lm_head_model_class=None, + double_head_model_class=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_position_ids = use_position_ids + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.n_positions = n_positions + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.n_choices = n_choices + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.scope = scope + self.config_class = config_class + self.base_model_class = base_model_class + self.lm_head_model_class = lm_head_model_class + self.double_head_model_class = double_head_model_class + self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class) - values = [] - for _ in range(total_dims): - values.append(rng.randint(0, vocab_size - 1)) + def prepare_config_and_inputs(self): + total_num_tokens = self.vocab_size + input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens) - return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() + position_ids = None + if self.use_position_ids: + position_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions) + + token_type_ids = None + if self.use_token_type_ids: + total_voc = self.vocab_size + token_type_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc) + + mc_labels = None + lm_labels = None + mc_token_ids = None + if self.use_labels: + mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + lm_labels = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels) + mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length) + + config = self.config_class( + vocab_size_or_config_json_file=self.vocab_size, + n_positions=self.n_positions, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + initializer_range=self.initializer_range) + + return (config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids) + + def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + model = self.base_model_class(config) + model.eval() + + outputs = model(input_ids, position_ids, token_type_ids) + outputs = model(input_ids, position_ids) + outputs = model(input_ids) + + hidden_state = outputs[0] + self.parent.assertListEqual( + list(hidden_state.size()), + [self.batch_size, self.n_choices, self.seq_length, self.hidden_size]) + + + def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + model = self.lm_head_model_class(config) + model.eval() + outputs = model(input_ids, position_ids, token_type_ids, lm_labels) + loss, lm_logits = outputs[:2] + + total_voc = self.vocab_size + self.parent.assertListEqual( + list(lm_logits.size()), + [self.batch_size, self.n_choices, self.seq_length, total_voc]) + self.parent.assertListEqual( + list(loss.size()), + []) + + def create_and_check_presents(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + for model_class in self.all_model_classes: + model = model_class(config) + model.eval() + outputs = model(input_ids) + presents = outputs[-1] + self.parent.assertEqual(self.num_hidden_layers, len(presents)) + self.parent.assertListEqual( + list(presents[0].size()), + [2, self.batch_size * self.n_choices, self.num_attention_heads, + self.seq_length, self.hidden_size // self.num_attention_heads]) + + def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + model = self.double_head_model_class(config) + model.eval() + outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels, + token_type_ids=token_type_ids, position_ids=position_ids) + lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4] + loss = [lm_loss, mc_loss] + + total_voc = self.vocab_size + self.parent.assertListEqual( + list(lm_logits.size()), + [self.batch_size, self.n_choices, self.seq_length, total_voc]) + self.parent.assertListEqual( + list(mc_logits.size()), + [self.batch_size, self.n_choices]) + self.parent.assertListEqual( + [list(l.size()) for l in loss], + [[], []]) + + def create_and_check_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]: + model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.parent.assertIsNotNone(model) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids) = config_and_inputs + inputs_dict = {'input_ids': input_ids} + return config, inputs_dict + + def run_common_tests(self, test_presents=False): + config_and_inputs = self.prepare_config_and_inputs() + self.create_and_check_base_model(*config_and_inputs) + + config_and_inputs = self.prepare_config_and_inputs() + self.create_and_check_lm_head(*config_and_inputs) + + config_and_inputs = self.prepare_config_and_inputs() + self.create_and_check_double_heads(*config_and_inputs) + + if test_presents: + config_and_inputs = self.prepare_config_and_inputs() + self.create_and_check_presents(*config_and_inputs) + + def run_slow_tests(self): + self.create_and_check_model_from_pretrained() class ConfigTester(object): @@ -275,179 +539,22 @@ class ConfigTester(object): self.create_and_test_config_to_json_file() -class GPTModelTester(object): - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_position_ids=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - n_positions=33, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - n_choices=3, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - scope=None, - config_class=None, - base_model_class=None, - lm_head_model_class=None, - double_head_model_class=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_position_ids = use_position_ids - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.n_positions = n_positions - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.n_choices = n_choices - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.scope = scope - self.config_class = config_class - self.base_model_class = base_model_class - self.lm_head_model_class = lm_head_model_class - self.double_head_model_class = double_head_model_class - self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class) - - def prepare_config_and_inputs(self): - total_num_tokens = self.vocab_size - input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens) - - position_ids = None - if self.use_position_ids: - position_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions) - - token_type_ids = None - if self.use_token_type_ids: - total_voc = self.vocab_size - token_type_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc) - - mc_labels = None - lm_labels = None - mc_token_ids = None - if self.use_labels: - mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - lm_labels = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels) - mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length) - - config = self.config_class( - vocab_size_or_config_json_file=self.vocab_size, - n_positions=self.n_positions, - n_embd=self.hidden_size, - n_layer=self.num_hidden_layers, - n_head=self.num_attention_heads, - initializer_range=self.initializer_range) - - return (config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids) - - def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids): - model = self.base_model_class(config) - model.eval() - - outputs = model(input_ids, position_ids, token_type_ids) - outputs = model(input_ids, position_ids) - outputs = model(input_ids) - - hidden_state = outputs[0] - self.parent.assertListEqual( - list(hidden_state.size()), - [self.batch_size, self.n_choices, self.seq_length, self.hidden_size]) - def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids): - model = self.lm_head_model_class(config) - model.eval() - outputs = model(input_ids, position_ids, token_type_ids, lm_labels) - loss, lm_logits = outputs[:2] +def ids_tensor(shape, vocab_size, rng=None, name=None): + """Creates a random int32 tensor of the shape within the vocab size.""" + if rng is None: + rng = random.Random() - total_voc = self.vocab_size - self.parent.assertListEqual( - list(lm_logits.size()), - [self.batch_size, self.n_choices, self.seq_length, total_voc]) - self.parent.assertListEqual( - list(loss.size()), - []) + total_dims = 1 + for dim in shape: + total_dims *= dim - def create_and_check_presents(self, config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids): - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - outputs = model(input_ids) - presents = outputs[-1] - self.parent.assertEqual(self.num_hidden_layers, len(presents)) - self.parent.assertListEqual( - list(presents[0].size()), - [2, self.batch_size * self.n_choices, self.num_attention_heads, - self.seq_length, self.hidden_size // self.num_attention_heads]) + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) - def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids): - model = self.double_head_model_class(config) - model.eval() - outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels, - token_type_ids=token_type_ids, position_ids=position_ids) - lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4] - loss = [lm_loss, mc_loss] - - total_voc = self.vocab_size - self.parent.assertListEqual( - list(lm_logits.size()), - [self.batch_size, self.n_choices, self.seq_length, total_voc]) - self.parent.assertListEqual( - list(mc_logits.size()), - [self.batch_size, self.n_choices]) - self.parent.assertListEqual( - [list(l.size()) for l in loss], - [[], []]) - - def create_and_check_model_from_pretrained(self): - cache_dir = "/tmp/pytorch_transformers_test/" - for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]: - model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) - self.parent.assertIsNotNone(model) - - def create_and_check_commons(self, config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids): - inputs_dict = {'input_ids': input_ids} - create_and_check_commons(self, config, inputs_dict) - - def run_common_tests(self, test_presents=False): - config_and_inputs = self.prepare_config_and_inputs() - self.create_and_check_base_model(*config_and_inputs) - - config_and_inputs = self.prepare_config_and_inputs() - self.create_and_check_lm_head(*config_and_inputs) - - config_and_inputs = self.prepare_config_and_inputs() - self.create_and_check_double_heads(*config_and_inputs) - - if test_presents: - config_and_inputs = self.prepare_config_and_inputs() - self.create_and_check_presents(*config_and_inputs) - - config_and_inputs = self.prepare_config_and_inputs() - self.create_and_check_commons(*config_and_inputs) - - def run_slow_tests(self): - self.create_and_check_model_from_pretrained() + return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() class ModelUtilsTest(unittest.TestCase): @@ -471,79 +578,6 @@ class ModelUtilsTest(unittest.TestCase): self.assertEqual(model.config.output_hidden_states, True) self.assertEqual(model.config, config) - def test_resize_tokens_embeddings(self): - logging.basicConfig(level=logging.INFO) - - - for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - config = BertConfig.from_pretrained(model_name) - model = BertModel.from_pretrained(model_name) - - model_vocab_size = config.vocab_size - # Retrieve the embeddings and clone theme - cloned_embeddings = model.embeddings.word_embeddings.weight.clone() - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.vocab_size, model_vocab_size + 10) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model.embeddings.word_embeddings.weight.shape[0], cloned_embeddings.shape[0] + 10) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model.resize_token_embeddings(model_vocab_size) - self.assertEqual(model.config.vocab_size, model_vocab_size) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model.embeddings.word_embeddings.weight.shape[0], cloned_embeddings.shape[0]) - - # Check that adding and removing tokens has not modified the first part of the embedding matrix. - models_equal = True - for p1, p2 in zip(cloned_embeddings, model.embeddings.word_embeddings.weight): - if p1.data.ne(p2.data).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - def test_tie_model_weights(self): - logging.basicConfig(level=logging.INFO) - - def check_same_values(layer_1, layer_2): - equal = True - for p1, p2 in zip(layer_1.weight, layer_2.weight): - if p1.data.ne(p2.data).sum() > 0: - equal = False - return equal - - for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - config = GPT2Config.from_pretrained(model_name) - model = GPT2LMHeadModel.from_pretrained(model_name) - - # Get the embeddings and decoding layer - embeddings = model.transformer.wte - decoding = model.lm_head - - # Check that the embedding layer and decoding layer are the same in size and in value - self.assertTrue(embeddings.weight.shape, decoding.weight.shape) - self.assertTrue(check_same_values(embeddings, decoding)) - - # Check that after modification, they remain the same. - embeddings.weight.data.div_(2) - # Check that the embedding layer and decoding layer are the same in size and in value - self.assertTrue(embeddings.weight.shape, decoding.weight.shape) - self.assertTrue(check_same_values(embeddings, decoding)) - - # Check that after modification, they remain the same. - decoding.weight.data.div_(4) - # Check that the embedding layer and decoding layer are the same in size and in value - self.assertTrue(embeddings.weight.shape, decoding.weight.shape) - self.assertTrue(check_same_values(embeddings, decoding)) - - # Check that after resize they remain tied. - model.resize_token_embeddings(config.vocab_size + 10) - decoding.weight.data.mul_(20) - # Check that the embedding layer and decoding layer are the same in size and in value - self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) - self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) - if __name__ == "__main__": unittest.main() diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py index 00a9cb4614..4e32cc37e1 100644 --- a/pytorch_transformers/tests/modeling_gpt2_test.py +++ b/pytorch_transformers/tests/modeling_gpt2_test.py @@ -16,19 +16,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os import unittest -import json -import random -import shutil import pytest -import torch from pytorch_transformers import (GPT2Config, GPT2Model, - GPT2LMHeadModel, GPT2DoubleHeadsModel) + GPT2LMHeadModel, GPT2DoubleHeadsModel) -from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester) +from .modeling_common_test import CommonTestCases, ConfigTester class GPT2ModelTest(unittest.TestCase): @@ -37,14 +32,14 @@ class GPT2ModelTest(unittest.TestCase): config_tester.run_common_tests() def test_model(self): - model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model, + model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model, lm_head_model_class=GPT2LMHeadModel, double_head_model_class=GPT2DoubleHeadsModel) model_tester.run_common_tests(test_presents=True) @pytest.mark.slow def test_pretrained(self): - model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model, + model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model, lm_head_model_class=GPT2LMHeadModel, double_head_model_class=GPT2DoubleHeadsModel) model_tester.run_slow_tests() diff --git a/pytorch_transformers/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py index 4f57f4661b..243afb9501 100644 --- a/pytorch_transformers/tests/modeling_openai_test.py +++ b/pytorch_transformers/tests/modeling_openai_test.py @@ -19,12 +19,11 @@ from __future__ import print_function import unittest import pytest -import torch from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) -from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester) +from .modeling_common_test import CommonTestCases, ConfigTester class OpenAIModelTest(unittest.TestCase): @@ -33,14 +32,14 @@ class OpenAIModelTest(unittest.TestCase): config_tester.run_common_tests() def test_model(self): - model_tester = GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel, + model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel, lm_head_model_class=OpenAIGPTLMHeadModel, double_head_model_class=OpenAIGPTDoubleHeadsModel) model_tester.run_common_tests(test_presents=False) @pytest.mark.slow def test_pretrained(self): - model_tester = GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel, + model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel, lm_head_model_class=OpenAIGPTLMHeadModel, double_head_model_class=OpenAIGPTDoubleHeadsModel) model_tester.run_slow_tests() diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py index 9631cd6034..e3c0fbcdf0 100644 --- a/pytorch_transformers/tests/modeling_transfo_xl_test.py +++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py @@ -28,9 +28,15 @@ import torch from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor +from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor + +class TransfoXLModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) + test_pruning = False + test_torchscript = False + test_resize_embeddings = False -class TransfoXLModelTest(unittest.TestCase): class TransfoXLModelTester(object): def __init__(self, @@ -52,7 +58,6 @@ class TransfoXLModelTest(unittest.TestCase): num_hidden_layers=5, scope=None, seed=1, - all_model_classes=(TransfoXLModel, TransfoXLLMHeadModel), ): self.parent = parent self.batch_size = batch_size @@ -73,7 +78,6 @@ class TransfoXLModelTest(unittest.TestCase): self.num_hidden_layers = num_hidden_layers self.scope = scope self.seed = seed - self.all_model_classes = all_model_classes def prepare_config_and_inputs(self): input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) @@ -171,16 +175,31 @@ class TransfoXLModelTest(unittest.TestCase): list(list(mem.size()) for mem in result["mems_2"]), [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) - def create_and_check_transfo_xl_commons(self, config, input_ids_1, input_ids_2, lm_labels): + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs inputs_dict = {'input_ids': input_ids_1} - create_and_check_commons(self, config, inputs_dict, test_pruning=False, test_torchscript=False) + return config, inputs_dict - def test_default(self): - self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self)) + + def setUp(self): + self.model_tester = TransfoXLModelTest.TransfoXLModelTester(self) + self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37) def test_config(self): - config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37) - config_tester.run_common_tests() + self.config_tester.run_common_tests() + + def test_transfo_xl_model(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + output_result = self.model_tester.create_transfo_xl_model(*config_and_inputs) + self.model_tester.check_transfo_xl_model_output(output_result) + + def test_transfo_xl_lm_head(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs) + self.model_tester.check_transfo_xl_lm_head_output(output_result) @pytest.mark.slow def test_model_from_pretrained(self): @@ -190,23 +209,6 @@ class TransfoXLModelTest(unittest.TestCase): shutil.rmtree(cache_dir) self.assertIsNotNone(model) - def run_tester(self, tester): - config_and_inputs = tester.prepare_config_and_inputs() - - tester.set_seed() - config_and_inputs = tester.prepare_config_and_inputs() - output_result = tester.create_transfo_xl_model(*config_and_inputs) - tester.check_transfo_xl_model_output(output_result) - - tester.set_seed() - config_and_inputs = tester.prepare_config_and_inputs() - output_result = tester.create_transfo_xl_lm_head(*config_and_inputs) - tester.check_transfo_xl_lm_head_output(output_result) - - tester.set_seed() - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_transfo_xl_commons(*config_and_inputs) - if __name__ == "__main__": unittest.main() diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py index 9d6bc4054d..85189859a6 100644 --- a/pytorch_transformers/tests/modeling_xlm_test.py +++ b/pytorch_transformers/tests/modeling_xlm_test.py @@ -23,10 +23,15 @@ import pytest from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification) from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor) +from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) -class XLMModelTest(unittest.TestCase): +class XLMModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (XLMModel, XLMWithLMHeadModel, + XLMForQuestionAnswering, XLMForSequenceClassification) + # , XLMForSequenceClassification, XLMForTokenClassification), + class XLMModelTester(object): def __init__(self, @@ -58,8 +63,6 @@ class XLMModelTest(unittest.TestCase): summary_type="last", use_proj=True, scope=None, - all_model_classes = (XLMModel, XLMWithLMHeadModel, - XLMForQuestionAnswering, XLMForSequenceClassification), # , XLMForSequenceClassification, XLMForTokenClassification), ): self.parent = parent self.batch_size = batch_size @@ -90,7 +93,6 @@ class XLMModelTest(unittest.TestCase): self.num_labels = num_labels self.num_choices = num_choices self.scope = scope - self.all_model_classes = all_model_classes def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) @@ -237,28 +239,23 @@ class XLMModelTest(unittest.TestCase): [self.batch_size, self.type_sequence_label_size]) - def create_and_check_xlm_commons(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, token_type_ids, input_lengths, + sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths} - create_and_check_commons(self, config, inputs_dict) + return config, inputs_dict - def test_default(self): - self.run_tester(XLMModelTest.XLMModelTester(self)) + def setUp(self): + self.model_tester = XLMModelTest.XLMModelTester(self) + self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37) def test_config(self): - config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37) - config_tester.run_common_tests() + self.config_tester.run_common_tests() - @pytest.mark.slow - def test_model_from_pretrained(self): - cache_dir = "/tmp/pytorch_transformers_test/" - for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) - self.assertIsNotNone(model) - - def run_tester(self, tester): - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_xlm_model(*config_and_inputs) + def test_xlm_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_model(*config_and_inputs) # config_and_inputs = tester.prepare_config_and_inputs() # tester.create_and_check_xlm_for_masked_lm(*config_and_inputs) @@ -275,8 +272,14 @@ class XLMModelTest(unittest.TestCase): # config_and_inputs = tester.prepare_config_and_inputs() # tester.create_and_check_xlm_for_token_classification(*config_and_inputs) - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_xlm_commons(*config_and_inputs) + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py index 41c114ce9c..3792125d6e 100644 --- a/pytorch_transformers/tests/modeling_xlnet_test.py +++ b/pytorch_transformers/tests/modeling_xlnet_test.py @@ -28,9 +28,14 @@ import torch from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering) from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor +from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor + +class XLNetModelTest(CommonTestCases.CommonModelTester): + + all_model_classes=(XLNetModel, XLNetLMHeadModel, + XLNetForSequenceClassification, XLNetForQuestionAnswering) + test_pruning = False -class XLNetModelTest(unittest.TestCase): class XLNetModelTester(object): def __init__(self, @@ -56,8 +61,6 @@ class XLNetModelTest(unittest.TestCase): initializer_range=0.05, seed=1, type_vocab_size=2, - all_model_classes=(XLNetModel, XLNetLMHeadModel, - XLNetForSequenceClassification, XLNetForQuestionAnswering), ): self.parent = parent self.batch_size = batch_size @@ -82,7 +85,6 @@ class XLNetModelTest(unittest.TestCase): self.seed = seed self.type_vocab_size = type_vocab_size self.type_sequence_label_size = type_sequence_label_size - self.all_model_classes = all_model_classes def prepare_config_and_inputs(self): input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) @@ -264,17 +266,41 @@ class XLNetModelTest(unittest.TestCase): list(list(mem.size()) for mem in result["mems_1"]), [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) - def create_and_check_xlnet_commons(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, + target_mapping, inp_q, segment_ids, lm_labels, + sequence_labels, is_impossible_labels) = config_and_inputs inputs_dict = {'input_ids': input_ids_1} - create_and_check_commons(self, config, inputs_dict, test_pruning=False) + return config, inputs_dict - def test_default(self): - self.run_tester(XLNetModelTest.XLNetModelTester(self)) + + def setUp(self): + self.model_tester = XLNetModelTest.XLNetModelTester(self) + self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37) def test_config(self): - config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37) - config_tester.run_common_tests() + self.config_tester.run_common_tests() + + def test_xlnet_base_model(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs) + + def test_xlnet_lm_head(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs) + + def test_xlnet_sequence_classif(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs) + + def test_xlnet_qa(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_qa(*config_and_inputs) @pytest.mark.slow def test_model_from_pretrained(self): @@ -284,27 +310,6 @@ class XLNetModelTest(unittest.TestCase): shutil.rmtree(cache_dir) self.assertIsNotNone(model) - def run_tester(self, tester): - tester.set_seed() - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_xlnet_base_model(*config_and_inputs) - - tester.set_seed() - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_xlnet_lm_head(*config_and_inputs) - - tester.set_seed() - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_xlnet_sequence_classif(*config_and_inputs) - - tester.set_seed() - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_xlnet_qa(*config_and_inputs) - - tester.set_seed() - config_and_inputs = tester.prepare_config_and_inputs() - tester.create_and_check_xlnet_commons(*config_and_inputs) - if __name__ == "__main__": unittest.main()