diff --git a/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py b/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py index 4a1d66b591..d7689d2810 100644 --- a/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py +++ b/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import XxxConfig, is_tf_available from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFCommonTestCases, ids_tensor +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_tf, slow @@ -32,7 +34,7 @@ if is_tf_available(): @require_tf -class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester): +class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase): all_model_classes = ( ( diff --git a/templates/adding_a_new_model/tests/test_modeling_xxx.py b/templates/adding_a_new_model/tests/test_modeling_xxx.py index b6364447b4..d66a91e3e4 100644 --- a/templates/adding_a_new_model/tests/test_modeling_xxx.py +++ b/templates/adding_a_new_model/tests/test_modeling_xxx.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import is_torch_available from .test_configuration_common import ConfigTester -from .test_modeling_common import CommonTestCases, ids_tensor +from .test_modeling_common import ModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -34,7 +36,7 @@ if is_torch_available(): @require_torch -class XxxModelTest(CommonTestCases.CommonModelTester): +class XxxModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering, XxxForSequenceClassification, XxxForTokenClassification) diff --git a/tests/test_modeling_albert.py b/tests/test_modeling_albert.py index eeecb0d4d4..67d4b1bbba 100644 --- a/tests/test_modeling_albert.py +++ b/tests/test_modeling_albert.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import is_torch_available from .test_configuration_common import ConfigTester -from .test_modeling_common import CommonTestCases, ids_tensor +from .test_modeling_common import ModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -33,7 +35,7 @@ if is_torch_available(): @require_torch -class AlbertModelTest(CommonTestCases.CommonModelTester): +class AlbertModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else () diff --git a/tests/test_modeling_bert.py b/tests/test_modeling_bert.py index 52aaece535..3909e3e95e 100644 --- a/tests/test_modeling_bert.py +++ b/tests/test_modeling_bert.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import is_torch_available from .test_configuration_common import ConfigTester -from .test_modeling_common import CommonTestCases, floats_tensor, ids_tensor +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -37,7 +39,7 @@ if is_torch_available(): @require_torch -class BertModelTest(CommonTestCases.CommonModelTester): +class BertModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( ( diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 0d1cfbd311..b2ef0c74d9 100644 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -69,737 +69,737 @@ def _config_zero_init(config): return configs_no_init -class CommonTestCases: - @require_torch - class CommonModelTester(unittest.TestCase): +@require_torch +class ModelTesterMixin: - model_tester = None - all_model_classes = () - test_torchscript = True - test_pruning = True - test_resize_embeddings = True - test_head_masking = True - is_encoder_decoder = False + model_tester = None + all_model_classes = () + test_torchscript = True + test_pruning = True + test_resize_embeddings = True + test_head_masking = True + is_encoder_decoder = False - def test_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + def test_save_load(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**inputs_dict) + out_2 = outputs[0].numpy() + out_2[np.isnan(out_2)] = 0 + + with TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname) model.to(torch_device) - model.eval() with torch.no_grad(): - outputs = model(**inputs_dict) - out_2 = outputs[0].numpy() - out_2[np.isnan(out_2)] = 0 + after_outputs = model(**inputs_dict) - with TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname) - model.to(torch_device) - with torch.no_grad(): - after_outputs = model(**inputs_dict) - - # Make sure we don't have nans - out_1 = after_outputs[0].cpu().numpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - param.data.mean().item(), - [0.0, 1.0], - msg="Parameter {} of model {} seems not properly initialized".format(name, model_class), - ) - - def test_determinism(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - first = model(**inputs_dict)[0] - second = model(**inputs_dict)[0] - out_1 = first.cpu().numpy() - out_2 = second.cpu().numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] + # Make sure we don't have nans + out_1 = after_outputs[0].cpu().numpy() + out_1[np.isnan(out_1)] = 0 max_diff = np.amax(np.abs(out_1 - out_2)) self.assertLessEqual(max_diff, 1e-5) - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - decoder_seq_length = ( - self.model_tester.decoder_seq_length - if hasattr(self.model_tester, "decoder_seq_length") - else self.model_tester.seq_length - ) - encoder_seq_length = ( - self.model_tester.encoder_seq_length - if hasattr(self.model_tester, "encoder_seq_length") - else self.model_tester.seq_length - ) - decoder_key_length = ( - self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length - ) - encoder_key_length = ( - self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length - ) - - for model_class in self.all_model_classes: - config.output_attentions = True - config.output_hidden_states = False - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**inputs_dict) - attentions = outputs[-1] - self.assertEqual(model.config.output_attentions, True) - self.assertEqual(model.config.output_hidden_states, False) - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - out_len = len(outputs) - - if self.is_encoder_decoder: - self.assertEqual(out_len % 2, 0) - decoder_attentions = outputs[(out_len // 2) - 1] - self.assertEqual(model.config.output_attentions, True) - self.assertEqual(model.config.output_hidden_states, False) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + self.assertIn( + param.data.mean().item(), + [0.0, 1.0], + msg="Parameter {} of model {} seems not properly initialized".format(name, model_class), ) - # Check attention is always last and order is fine - config.output_attentions = True - config.output_hidden_states = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**inputs_dict) - self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) - self.assertEqual(model.config.output_attentions, True) - self.assertEqual(model.config.output_hidden_states, True) + def test_determinism(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - self_attentions = outputs[-1] - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + first = model(**inputs_dict)[0] + second = model(**inputs_dict)[0] + out_1 = first.cpu().numpy() + out_2 = second.cpu().numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + decoder_seq_length = ( + self.model_tester.decoder_seq_length + if hasattr(self.model_tester, "decoder_seq_length") + else self.model_tester.seq_length + ) + encoder_seq_length = ( + self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "encoder_seq_length") + else self.model_tester.seq_length + ) + decoder_key_length = ( + self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length + ) + encoder_key_length = ( + self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length + ) + + for model_class in self.all_model_classes: + config.output_attentions = True + config.output_hidden_states = False + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**inputs_dict) + attentions = outputs[-1] + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, False) + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + self.assertEqual(out_len % 2, 0) + decoder_attentions = outputs[(out_len // 2) - 1] + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, False) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], ) - def test_torchscript(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - self._create_and_check_torchscript(config, inputs_dict) - - def test_torchscript_output_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - + # Check attention is always last and order is fine config.output_attentions = True - self._create_and_check_torchscript(config, inputs_dict) - - def test_torchscript_output_hidden_state(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - self._create_and_check_torchscript(config, inputs_dict) + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**inputs_dict) + self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, True) - def _create_and_check_torchscript(self, config, inputs_dict): - if not self.test_torchscript: - return + self_attentions = outputs[-1] + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) - configs_no_init = _config_zero_init(config) # To be sure we have no Nan - configs_no_init.torchscript = True - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - model.to(torch_device) - model.eval() - inputs = inputs_dict["input_ids"] # Let's keep only input_ids + def test_torchscript(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + self._create_and_check_torchscript(config, inputs_dict) + + def test_torchscript_output_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + config.output_attentions = True + self._create_and_check_torchscript(config, inputs_dict) + + def test_torchscript_output_hidden_state(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + config.output_hidden_states = True + self._create_and_check_torchscript(config, inputs_dict) + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + return + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + inputs = inputs_dict["input_ids"] # Let's keep only input_ids + + try: + traced_gpt2 = torch.jit.trace(model, inputs) + except RuntimeError: + self.fail("Couldn't trace module.") + + with TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") try: - traced_gpt2 = torch.jit.trace(model, inputs) - except RuntimeError: - self.fail("Couldn't trace module.") + torch.jit.save(traced_gpt2, pt_file_name) + except Exception: + self.fail("Couldn't save module.") - with TemporaryDirectory() as tmp_dir_name: - pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") - try: - torch.jit.save(traced_gpt2, pt_file_name) - except Exception: - self.fail("Couldn't save module.") + model.to(torch_device) + model.eval() - try: - loaded_model = torch.jit.load(pt_file_name) - except Exception: - self.fail("Couldn't load module.") + loaded_model.to(torch_device) + loaded_model.eval() - model.to(torch_device) - model.eval() + model_params = model.parameters() + loaded_model_params = loaded_model.parameters() - loaded_model.to(torch_device) - loaded_model.eval() + models_equal = True + for p1, p2 in zip(model_params, loaded_model_params): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False - model_params = model.parameters() - loaded_model_params = loaded_model.parameters() + self.assertTrue(models_equal) - models_equal = True - for p1, p2 in zip(model_params, loaded_model_params): - if p1.data.ne(p2.data).sum() > 0: - models_equal = False + def test_headmasking(self): + if not self.test_head_masking: + return - self.assertTrue(models_equal) + global_rng.seed(42) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + global_rng.seed() - def test_headmasking(self): - if not self.test_head_masking: - return + config.output_attentions = True + config.output_hidden_states = True + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() - global_rng.seed(42) + # Prepare head_mask + # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) + head_mask = torch.ones( + self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device + ) + head_mask[0, 0] = 0 + head_mask[-1, :-1] = 0 + head_mask.requires_grad_(requires_grad=True) + inputs = inputs_dict.copy() + inputs["head_mask"] = head_mask + + outputs = model(**inputs) + + # Test that we can get a gradient back for importance score computation + output = sum(t.sum() for t in outputs[0]) + output = output.sum() + output.backward() + multihead_outputs = head_mask.grad + + attentions = outputs[-1] + hidden_states = outputs[-2] + + # Remove Nan + for t in attentions: + self.assertLess( + torch.sum(torch.isnan(t)), t.numel() / 4 + ) # Check we don't have more than 25% nans (arbitrary) + attentions = [ + t.masked_fill(torch.isnan(t), 0.0) for t in attentions + ] # remove them (the test is less complete) + + self.assertIsNotNone(multihead_outputs) + self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) + self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) + + def test_head_pruning(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - global_rng.seed() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] config.output_attentions = True + config.output_hidden_states = False + model = model_class(config=config) + model.to(torch_device) + model.eval() + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]} + model.prune_heads(heads_to_prune) + with torch.no_grad(): + outputs = model(**inputs_dict) + + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + + def test_head_pruning_save_load_from_pretrained(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + config.output_attentions = True + config.output_hidden_states = False + model = model_class(config=config) + model.to(torch_device) + model.eval() + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]} + model.prune_heads(heads_to_prune) + + with TemporaryDirectory() as temp_dir_name: + model.save_pretrained(temp_dir_name) + model = model_class.from_pretrained(temp_dir_name) + model.to(torch_device) + + with torch.no_grad(): + outputs = model(**inputs_dict) + attentions = outputs[-1] + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + + def test_head_pruning_save_load_from_config_init(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + config.output_attentions = True + config.output_hidden_states = False + + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]} + config.pruned_heads = heads_to_prune + + model = model_class(config=config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**inputs_dict) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + + def test_head_pruning_integration(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + config.output_attentions = True + config.output_hidden_states = False + + heads_to_prune = {0: [0], 1: [1, 2]} + config.pruned_heads = heads_to_prune + + model = model_class(config=config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**inputs_dict) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) + + with TemporaryDirectory() as temp_dir_name: + model.save_pretrained(temp_dir_name) + model = model_class.from_pretrained(temp_dir_name) + model.to(torch_device) + + with torch.no_grad(): + outputs = model(**inputs_dict) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) + + heads_to_prune = {0: [0], 2: [1, 2]} + model.prune_heads(heads_to_prune) + + with torch.no_grad(): + outputs = model(**inputs_dict) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) + + self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]}) + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: config.output_hidden_states = True - configs_no_init = _config_zero_init(config) # To be sure we have no Nan - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - model.to(torch_device) - model.eval() + config.output_attentions = False + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**inputs_dict) + hidden_states = outputs[-1] + self.assertEqual(model.config.output_attentions, False) + self.assertEqual(model.config.output_hidden_states, True) + self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [ + self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "encoder_seq_length") + else self.model_tester.seq_length, + self.model_tester.hidden_size, + ], + ) - # Prepare head_mask - # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) - head_mask = torch.ones( - self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device - ) - head_mask[0, 0] = 0 - head_mask[-1, :-1] = 0 - head_mask.requires_grad_(requires_grad=True) - inputs = inputs_dict.copy() - inputs["head_mask"] = head_mask + def test_resize_tokens_embeddings(self): + original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + if not self.test_resize_embeddings: + return - outputs = model(**inputs) + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config) - # Test that we can get a gradient back for importance score computation - output = sum(t.sum() for t in outputs[0]) - output = output.sum() - output.backward() - multihead_outputs = head_mask.grad + model_vocab_size = config.vocab_size + # Retrieve the embeddings and clone theme + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() - attentions = outputs[-1] - hidden_states = outputs[-2] + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.vocab_size, model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) - # Remove Nan - for t in attentions: - self.assertLess( - torch.sum(torch.isnan(t)), t.numel() / 4 - ) # Check we don't have more than 25% nans (arbitrary) - attentions = [ - t.masked_fill(torch.isnan(t), 0.0) for t in attentions - ] # remove them (the test is less complete) + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) - self.assertIsNotNone(multihead_outputs) - self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) - self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) - self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0) - self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) - self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0) - self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) + # Check that adding and removing tokens has not modified the first part of the embedding matrix. + models_equal = True + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False - def test_head_pruning(self): - if not self.test_pruning: - return + self.assertTrue(models_equal) - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if "head_mask" in inputs_dict: - del inputs_dict["head_mask"] + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding)) + model.set_input_embeddings(torch.nn.Embedding(10, 10)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, torch.nn.Linear)) - config.output_attentions = True - config.output_hidden_states = False - model = model_class(config=config) - model.to(torch_device) - model.eval() - heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]} - model.prune_heads(heads_to_prune) - with torch.no_grad(): - outputs = model(**inputs_dict) + def test_tie_model_weights(self): + if not self.test_torchscript: + return - attentions = outputs[-1] + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - self.assertEqual(attentions[0].shape[-3], 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + def check_same_values(layer_1, layer_2): + equal = True + for p1, p2 in zip(layer_1.weight, layer_2.weight): + if p1.data.ne(p2.data).sum() > 0: + equal = False + return equal - def test_head_pruning_save_load_from_pretrained(self): - if not self.test_pruning: - return + for model_class in self.all_model_classes: + config.torchscript = True + model_not_tied = model_class(config) + if model_not_tied.get_output_embeddings() is None: + continue - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + params_not_tied = list(model_not_tied.parameters()) - if "head_mask" in inputs_dict: - del inputs_dict["head_mask"] + config_tied = copy.deepcopy(config) + config_tied.torchscript = False + model_tied = model_class(config_tied) + params_tied = list(model_tied.parameters()) - config.output_attentions = True - config.output_hidden_states = False - model = model_class(config=config) - model.to(torch_device) - model.eval() - heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]} - model.prune_heads(heads_to_prune) + # Check that the embedding layer and decoding layer are the same in size and in value + self.assertGreater(len(params_not_tied), len(params_tied)) + # self.assertTrue(check_same_values(embeddings, decoding)) - with TemporaryDirectory() as temp_dir_name: - model.save_pretrained(temp_dir_name) - model = model_class.from_pretrained(temp_dir_name) - model.to(torch_device) + # # Check that after modification, they remain the same. + # embeddings.weight.data.div_(2) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) + # self.assertTrue(check_same_values(embeddings, decoding)) - with torch.no_grad(): - outputs = model(**inputs_dict) - attentions = outputs[-1] - self.assertEqual(attentions[0].shape[-3], 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + # # Check that after modification, they remain the same. + # decoding.weight.data.div_(4) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) + # self.assertTrue(check_same_values(embeddings, decoding)) - def test_head_pruning_save_load_from_config_init(self): - if not self.test_pruning: - return + # Check that after resize they remain tied. + model_tied.resize_token_embeddings(config.vocab_size + 10) + params_tied_2 = list(model_tied.parameters()) + self.assertGreater(len(params_not_tied), len(params_tied)) + self.assertEqual(len(params_tied_2), len(params_tied)) - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + # decoding.weight.data.mul_(20) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) + # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) - if "head_mask" in inputs_dict: - del inputs_dict["head_mask"] + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + if not self.is_encoder_decoder: + input_ids = inputs_dict["input_ids"] + del inputs_dict["input_ids"] + else: + encoder_input_ids = inputs_dict["encoder_input_ids"] + decoder_input_ids = inputs_dict["decoder_input_ids"] + del inputs_dict["encoder_input_ids"] + del inputs_dict["decoder_input_ids"] - config.output_attentions = True - config.output_hidden_states = False + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() - heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]} - config.pruned_heads = heads_to_prune - - model = model_class(config=config) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - outputs = model(**inputs_dict) - attentions = outputs[-1] - - self.assertEqual(attentions[0].shape[-3], 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) - - def test_head_pruning_integration(self): - if not self.test_pruning: - return - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - if "head_mask" in inputs_dict: - del inputs_dict["head_mask"] - - config.output_attentions = True - config.output_hidden_states = False - - heads_to_prune = {0: [0], 1: [1, 2]} - config.pruned_heads = heads_to_prune - - model = model_class(config=config) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - outputs = model(**inputs_dict) - attentions = outputs[-1] - - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) - - with TemporaryDirectory() as temp_dir_name: - model.save_pretrained(temp_dir_name) - model = model_class.from_pretrained(temp_dir_name) - model.to(torch_device) - - with torch.no_grad(): - outputs = model(**inputs_dict) - attentions = outputs[-1] - - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) - - heads_to_prune = {0: [0], 2: [1, 2]} - model.prune_heads(heads_to_prune) - - with torch.no_grad(): - outputs = model(**inputs_dict) - attentions = outputs[-1] - - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) - - self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]}) - - def test_hidden_states_output(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - config.output_hidden_states = True - config.output_attentions = False - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**inputs_dict) - hidden_states = outputs[-1] - self.assertEqual(model.config.output_attentions, False) - self.assertEqual(model.config.output_hidden_states, True) - self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [ - self.model_tester.encoder_seq_length - if hasattr(self.model_tester, "encoder_seq_length") - else self.model_tester.seq_length, - self.model_tester.hidden_size, - ], - ) - - def test_resize_tokens_embeddings(self): - original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - return - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - model_vocab_size = config.vocab_size - # Retrieve the embeddings and clone theme - model_embed = model.resize_token_embeddings(model_vocab_size) - cloned_embeddings = model_embed.weight.clone() - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.vocab_size, model_vocab_size + 10) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) - - # Check that adding and removing tokens has not modified the first part of the embedding matrix. - models_equal = True - for p1, p2 in zip(cloned_embeddings, model_embed.weight): - if p1.data.ne(p2.data).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - def test_model_common_attributes(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding)) - model.set_input_embeddings(torch.nn.Embedding(10, 10)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, torch.nn.Linear)) - - def test_tie_model_weights(self): - if not self.test_torchscript: - return - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def check_same_values(layer_1, layer_2): - equal = True - for p1, p2 in zip(layer_1.weight, layer_2.weight): - if p1.data.ne(p2.data).sum() > 0: - equal = False - return equal - - for model_class in self.all_model_classes: - config.torchscript = True - model_not_tied = model_class(config) - if model_not_tied.get_output_embeddings() is None: - continue - - params_not_tied = list(model_not_tied.parameters()) - - config_tied = copy.deepcopy(config) - config_tied.torchscript = False - model_tied = model_class(config_tied) - params_tied = list(model_tied.parameters()) - - # Check that the embedding layer and decoding layer are the same in size and in value - self.assertGreater(len(params_not_tied), len(params_tied)) - # self.assertTrue(check_same_values(embeddings, decoding)) - - # # Check that after modification, they remain the same. - # embeddings.weight.data.div_(2) - # # Check that the embedding layer and decoding layer are the same in size and in value - # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) - # self.assertTrue(check_same_values(embeddings, decoding)) - - # # Check that after modification, they remain the same. - # decoding.weight.data.div_(4) - # # Check that the embedding layer and decoding layer are the same in size and in value - # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) - # self.assertTrue(check_same_values(embeddings, decoding)) - - # Check that after resize they remain tied. - model_tied.resize_token_embeddings(config.vocab_size + 10) - params_tied_2 = list(model_tied.parameters()) - self.assertGreater(len(params_not_tied), len(params_tied)) - self.assertEqual(len(params_tied_2), len(params_tied)) - - # decoding.weight.data.mul_(20) - # # Check that the embedding layer and decoding layer are the same in size and in value - # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) - # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) - - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + wte = model.get_input_embeddings() if not self.is_encoder_decoder: - input_ids = inputs_dict["input_ids"] - del inputs_dict["input_ids"] + inputs_dict["inputs_embeds"] = wte(input_ids) else: - encoder_input_ids = inputs_dict["encoder_input_ids"] - decoder_input_ids = inputs_dict["decoder_input_ids"] - del inputs_dict["encoder_input_ids"] - del inputs_dict["decoder_input_ids"] - - for model_class in self.all_model_classes: - model = model_class(config) - model.to(torch_device) - model.eval() - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs_dict["inputs_embeds"] = wte(input_ids) - else: - inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids) - inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids) - - with torch.no_grad(): - outputs = model(**inputs_dict) - - class GPTModelTester(CommonModelTester): - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_position_ids=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - n_positions=33, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - n_choices=3, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - scope=None, - config_class=None, - base_model_class=None, - lm_head_model_class=None, - double_head_model_class=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_position_ids = use_position_ids - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.n_positions = n_positions - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.n_choices = n_choices - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.scope = scope - self.config_class = config_class - self.base_model_class = base_model_class - self.lm_head_model_class = lm_head_model_class - self.double_head_model_class = double_head_model_class - self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class) - - def prepare_config_and_inputs(self): - total_num_tokens = self.vocab_size - input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens) - - position_ids = None - if self.use_position_ids: - position_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions) - - token_type_ids = None - if self.use_token_type_ids: - total_voc = self.vocab_size - token_type_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc) - - mc_labels = None - lm_labels = None - mc_token_ids = None - if self.use_labels: - mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - lm_labels = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels) - mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length) - - config = self.config_class( - vocab_size=self.vocab_size, - n_positions=self.n_positions, - n_embd=self.hidden_size, - n_layer=self.num_hidden_layers, - n_head=self.num_attention_heads, - initializer_range=self.initializer_range, - ) - - return (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids) - - def create_and_check_base_model( - self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids - ): - model = self.base_model_class(config) - model.to(torch_device) - model.eval() + inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids) + inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids) with torch.no_grad(): - outputs = model(input_ids, position_ids, token_type_ids) - outputs = model(input_ids, position_ids) + outputs = model(**inputs_dict) + + +class GPTModelTester(ModelTesterMixin): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_position_ids=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + n_positions=33, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + n_choices=3, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + scope=None, + config_class=None, + base_model_class=None, + lm_head_model_class=None, + double_head_model_class=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_position_ids = use_position_ids + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.n_positions = n_positions + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.n_choices = n_choices + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.scope = scope + self.config_class = config_class + self.base_model_class = base_model_class + self.lm_head_model_class = lm_head_model_class + self.double_head_model_class = double_head_model_class + self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class) + + def prepare_config_and_inputs(self): + total_num_tokens = self.vocab_size + input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens) + + position_ids = None + if self.use_position_ids: + position_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions) + + token_type_ids = None + if self.use_token_type_ids: + total_voc = self.vocab_size + token_type_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc) + + mc_labels = None + lm_labels = None + mc_token_ids = None + if self.use_labels: + mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + lm_labels = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels) + mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length) + + config = self.config_class( + vocab_size=self.vocab_size, + n_positions=self.n_positions, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + initializer_range=self.initializer_range, + ) + + return (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids) + + def create_and_check_base_model( + self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids + ): + model = self.base_model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(input_ids, position_ids, token_type_ids) + outputs = model(input_ids, position_ids) + outputs = model(input_ids) + + hidden_state = outputs[0] + self.parent.assertListEqual( + list(hidden_state.size()), [self.batch_size, self.n_choices, self.seq_length, self.hidden_size] + ) + + def create_and_check_lm_head( + self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids + ): + model = self.lm_head_model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(input_ids, position_ids, token_type_ids, lm_labels) + loss, lm_logits = outputs[:2] + + total_voc = self.vocab_size + self.parent.assertListEqual( + list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc] + ) + self.parent.assertListEqual(list(loss.size()), []) + + def create_and_check_presents( + self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids + ): + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): outputs = model(input_ids) - - hidden_state = outputs[0] + presents = outputs[-1] + self.parent.assertEqual(self.num_hidden_layers, len(presents)) self.parent.assertListEqual( - list(hidden_state.size()), [self.batch_size, self.n_choices, self.seq_length, self.hidden_size] + list(presents[0].size()), + [ + 2, + self.batch_size * self.n_choices, + self.num_attention_heads, + self.seq_length, + self.hidden_size // self.num_attention_heads, + ], ) - def create_and_check_lm_head( - self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids - ): - model = self.lm_head_model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(input_ids, position_ids, token_type_ids, lm_labels) - loss, lm_logits = outputs[:2] - - total_voc = self.vocab_size - self.parent.assertListEqual( - list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc] + def create_and_check_double_heads( + self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids + ): + model = self.double_head_model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model( + input_ids, + mc_token_ids, + lm_labels=lm_labels, + mc_labels=mc_labels, + token_type_ids=token_type_ids, + position_ids=position_ids, ) - self.parent.assertListEqual(list(loss.size()), []) + lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4] + loss = [lm_loss, mc_loss] - def create_and_check_presents( - self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids - ): - for model_class in self.all_model_classes: - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(input_ids) - presents = outputs[-1] - self.parent.assertEqual(self.num_hidden_layers, len(presents)) - self.parent.assertListEqual( - list(presents[0].size()), - [ - 2, - self.batch_size * self.n_choices, - self.num_attention_heads, - self.seq_length, - self.hidden_size // self.num_attention_heads, - ], - ) + total_voc = self.vocab_size + self.parent.assertListEqual( + list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc] + ) + self.parent.assertListEqual(list(mc_logits.size()), [self.batch_size, self.n_choices]) + self.parent.assertListEqual([list(l.size()) for l in loss], [[], []]) - def create_and_check_double_heads( - self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids - ): - model = self.double_head_model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model( - input_ids, - mc_token_ids, - lm_labels=lm_labels, - mc_labels=mc_labels, - token_type_ids=token_type_ids, - position_ids=position_ids, - ) - lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4] - loss = [lm_loss, mc_loss] + def create_and_check_model_from_pretrained(self): + for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]: + model = self.base_model_class.from_pretrained(model_name, cache_dir=CACHE_DIR) + self.parent.assertIsNotNone(model) - total_voc = self.vocab_size - self.parent.assertListEqual( - list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc] - ) - self.parent.assertListEqual(list(mc_logits.size()), [self.batch_size, self.n_choices]) - self.parent.assertListEqual([list(l.size()) for l in loss], [[], []]) + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids) = config_and_inputs + inputs_dict = {"input_ids": input_ids} + return config, inputs_dict - def create_and_check_model_from_pretrained(self): - for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]: - model = self.base_model_class.from_pretrained(model_name, cache_dir=CACHE_DIR) - self.parent.assertIsNotNone(model) + def run_common_tests(self, test_presents=False): + config_and_inputs = self.prepare_config_and_inputs() + self.create_and_check_base_model(*config_and_inputs) - def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + self.create_and_check_lm_head(*config_and_inputs) + + config_and_inputs = self.prepare_config_and_inputs() + self.create_and_check_double_heads(*config_and_inputs) + + if test_presents: config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids) = config_and_inputs - inputs_dict = {"input_ids": input_ids} - return config, inputs_dict + self.create_and_check_presents(*config_and_inputs) - def run_common_tests(self, test_presents=False): - config_and_inputs = self.prepare_config_and_inputs() - self.create_and_check_base_model(*config_and_inputs) - - config_and_inputs = self.prepare_config_and_inputs() - self.create_and_check_lm_head(*config_and_inputs) - - config_and_inputs = self.prepare_config_and_inputs() - self.create_and_check_double_heads(*config_and_inputs) - - if test_presents: - config_and_inputs = self.prepare_config_and_inputs() - self.create_and_check_presents(*config_and_inputs) - - @slow - def run_slow_tests(self): - self.create_and_check_model_from_pretrained() + @slow + def run_slow_tests(self): + self.create_and_check_model_from_pretrained() class ConfigTester(object): diff --git a/tests/test_modeling_ctrl.py b/tests/test_modeling_ctrl.py index 519b529299..b2fa4941ab 100644 --- a/tests/test_modeling_ctrl.py +++ b/tests/test_modeling_ctrl.py @@ -13,10 +13,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import is_torch_available from .test_configuration_common import ConfigTester -from .test_modeling_common import CommonTestCases, ids_tensor +from .test_modeling_common import ModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -25,7 +27,7 @@ if is_torch_available(): @require_torch -class CTRLModelTest(CommonTestCases.CommonModelTester): +class CTRLModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else () test_pruning = False diff --git a/tests/test_modeling_distilbert.py b/tests/test_modeling_distilbert.py index 6c83751bad..b63c50a8e0 100644 --- a/tests/test_modeling_distilbert.py +++ b/tests/test_modeling_distilbert.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import is_torch_available from .test_configuration_common import ConfigTester -from .test_modeling_common import CommonTestCases, ids_tensor +from .test_modeling_common import ModelTesterMixin, ids_tensor from .utils import require_torch, torch_device @@ -33,7 +35,7 @@ if is_torch_available(): @require_torch -class DistilBertModelTest(CommonTestCases.CommonModelTester): +class DistilBertModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification) diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py index 2ca8f14d7a..55422d0ab7 100644 --- a/tests/test_modeling_gpt2.py +++ b/tests/test_modeling_gpt2.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import is_torch_available from .test_configuration_common import ConfigTester -from .test_modeling_common import CommonTestCases, ids_tensor +from .test_modeling_common import ModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -32,7 +34,7 @@ if is_torch_available(): @require_torch -class GPT2ModelTest(CommonTestCases.CommonModelTester): +class GPT2ModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else () diff --git a/tests/test_modeling_openai.py b/tests/test_modeling_openai.py index d9b2d0f228..ccc187357f 100644 --- a/tests/test_modeling_openai.py +++ b/tests/test_modeling_openai.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import is_torch_available from .test_configuration_common import ConfigTester -from .test_modeling_common import CommonTestCases, ids_tensor +from .test_modeling_common import ModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -32,7 +34,7 @@ if is_torch_available(): @require_torch -class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): +class OpenAIGPTModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else () diff --git a/tests/test_modeling_roberta.py b/tests/test_modeling_roberta.py index 61d2055aea..1994405e07 100644 --- a/tests/test_modeling_roberta.py +++ b/tests/test_modeling_roberta.py @@ -19,7 +19,7 @@ import unittest from transformers import is_torch_available from .test_configuration_common import ConfigTester -from .test_modeling_common import CommonTestCases, ids_tensor +from .test_modeling_common import ModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -37,7 +37,7 @@ if is_torch_available(): @require_torch -class RobertaModelTest(CommonTestCases.CommonModelTester): +class RobertaModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else () diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py index f12fa10d3b..4c58a67972 100644 --- a/tests/test_modeling_t5.py +++ b/tests/test_modeling_t5.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import is_torch_available from .test_configuration_common import ConfigTester -from .test_modeling_common import CommonTestCases, ids_tensor +from .test_modeling_common import ModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_torch, slow @@ -27,7 +29,7 @@ if is_torch_available(): @require_torch -class T5ModelTest(CommonTestCases.CommonModelTester): +class T5ModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (T5Model, T5WithLMHeadModel) if is_torch_available() else () test_pruning = False diff --git a/tests/test_modeling_tf_albert.py b/tests/test_modeling_tf_albert.py index 3761c3252f..130d0de8b1 100644 --- a/tests/test_modeling_tf_albert.py +++ b/tests/test_modeling_tf_albert.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import AlbertConfig, is_tf_available from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFCommonTestCases, ids_tensor +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_tf, slow @@ -31,7 +33,7 @@ if is_tf_available(): @require_tf -class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester): +class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase): all_model_classes = ( (TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification) if is_tf_available() else () diff --git a/tests/test_modeling_tf_bert.py b/tests/test_modeling_tf_bert.py index d93fd133ee..140a4f4db2 100644 --- a/tests/test_modeling_tf_bert.py +++ b/tests/test_modeling_tf_bert.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import BertConfig, is_tf_available from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFCommonTestCases, ids_tensor +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_tf, slow @@ -36,7 +38,7 @@ if is_tf_available(): @require_tf -class TFBertModelTest(TFCommonTestCases.TFCommonModelTester): +class TFBertModelTest(TFModelTesterMixin, unittest.TestCase): all_model_classes = ( ( diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 868eb2d9e0..83f089eda9 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -20,7 +20,6 @@ import random import shutil import sys import tempfile -import unittest from transformers import is_tf_available, is_torch_available @@ -59,307 +58,300 @@ def _config_zero_init(config): return configs_no_init -class TFCommonTestCases: - @require_tf - class TFCommonModelTester(unittest.TestCase): +@require_tf +class TFModelTesterMixin: - model_tester = None - all_model_classes = () - test_torchscript = True - test_pruning = True - test_resize_embeddings = True - is_encoder_decoder = False + model_tester = None + all_model_classes = () + test_torchscript = True + test_pruning = True + test_resize_embeddings = True + is_encoder_decoder = False - def test_initialization(self): - pass - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + def test_initialization(self): + pass + # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # configs_no_init = _config_zero_init(config) - # for model_class in self.all_model_classes: - # model = model_class(config=configs_no_init) - # for name, param in model.named_parameters(): - # if param.requires_grad: - # self.assertIn(param.data.mean().item(), [0.0, 1.0], - # msg="Parameter {} of model {} seems not properly initialized".format(name, model_class)) + # configs_no_init = _config_zero_init(config) + # for model_class in self.all_model_classes: + # model = model_class(config=configs_no_init) + # for name, param in model.named_parameters(): + # if param.requires_grad: + # self.assertIn(param.data.mean().item(), [0.0, 1.0], + # msg="Parameter {} of model {} seems not properly initialized".format(name, model_class)) - def test_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + def test_save_load(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - outputs = model(inputs_dict) + for model_class in self.all_model_classes: + model = model_class(config) + outputs = model(inputs_dict) - with TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname) - after_outputs = model(inputs_dict) + with TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname) + after_outputs = model(inputs_dict) - # Make sure we don't have nans - out_1 = after_outputs[0].numpy() - out_2 = outputs[0].numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def test_pt_tf_model_equivalence(self): - if not is_torch_available(): - return - - import torch - import transformers - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beggining - pt_model_class = getattr(transformers, pt_model_class_name) - - config.output_hidden_states = True - tf_model = model_class(config) - pt_model = pt_model_class(config) - - # Check we can load pt model in tf and vice-versa with model => model functions - tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict) - pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) - - # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences - pt_model.eval() - pt_inputs_dict = dict( - (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items() - ) - with torch.no_grad(): - pto = pt_model(**pt_inputs_dict) - tfo = tf_model(inputs_dict, training=False) - tf_hidden_states = tfo[0].numpy() - pt_hidden_states = pto[0].numpy() - tf_hidden_states[np.isnan(tf_hidden_states)] = 0 - pt_hidden_states[np.isnan(pt_hidden_states)] = 0 - max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states)) - self.assertLessEqual(max_diff, 2e-2) - - # Check we can load pt model in tf and vice-versa with checkpoint => model functions - with TemporaryDirectory() as tmpdirname: - pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") - torch.save(pt_model.state_dict(), pt_checkpoint_path) - tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) - - tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") - tf_model.save_weights(tf_checkpoint_path) - pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) - - # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences - pt_model.eval() - pt_inputs_dict = dict( - (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items() - ) - with torch.no_grad(): - pto = pt_model(**pt_inputs_dict) - tfo = tf_model(inputs_dict) - tfo = tfo[0].numpy() - pto = pto[0].numpy() - tfo[np.isnan(tfo)] = 0 - pto[np.isnan(pto)] = 0 - max_diff = np.amax(np.abs(tfo - pto)) - self.assertLessEqual(max_diff, 2e-2) - - def test_compile_tf_model(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - if self.is_encoder_decoder: - input_ids = { - "decoder_input_ids": tf.keras.Input( - batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32" - ), - "encoder_input_ids": tf.keras.Input( - batch_shape=(2, 2000), name="encoder_input_ids", dtype="int32" - ), - } - else: - input_ids = tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32") - optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) - loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") - - for model_class in self.all_model_classes: - # Prepare our model - model = model_class(config) - - # Let's load it from the disk to be sure we can use pretrained weights - with TemporaryDirectory() as tmpdirname: - outputs = model(inputs_dict) # build the model - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname) - - outputs_dict = model(input_ids) - hidden_states = outputs_dict[0] - - # Add a dense layer on top to test intetgration with other keras modules - outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) - - # Compile extended model - extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs]) - extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) - - def test_keyword_and_dict_args(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - outputs_dict = model(inputs_dict) - - inputs_keywords = copy.deepcopy(inputs_dict) - input_ids = inputs_keywords.pop( - "input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None - ) - outputs_keywords = model(input_ids, **inputs_keywords) - - output_dict = outputs_dict[0].numpy() - output_keywords = outputs_keywords[0].numpy() - - self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - decoder_seq_length = ( - self.model_tester.decoder_seq_length - if hasattr(self.model_tester, "decoder_seq_length") - else self.model_tester.seq_length - ) - encoder_seq_length = ( - self.model_tester.encoder_seq_length - if hasattr(self.model_tester, "encoder_seq_length") - else self.model_tester.seq_length - ) - decoder_key_length = ( - self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length - ) - encoder_key_length = ( - self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length - ) - - for model_class in self.all_model_classes: - config.output_attentions = True - config.output_hidden_states = False - model = model_class(config) - outputs = model(inputs_dict) - attentions = [t.numpy() for t in outputs[-1]] - self.assertEqual(model.config.output_attentions, True) - self.assertEqual(model.config.output_hidden_states, False) - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - out_len = len(outputs) - - if self.is_encoder_decoder: - self.assertEqual(out_len % 2, 0) - decoder_attentions = outputs[(out_len // 2) - 1] - self.assertEqual(model.config.output_attentions, True) - self.assertEqual(model.config.output_hidden_states, False) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) - - # Check attention is always last and order is fine - config.output_attentions = True - config.output_hidden_states = True - model = model_class(config) - outputs = model(inputs_dict) - self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) - self.assertEqual(model.config.output_attentions, True) - self.assertEqual(model.config.output_hidden_states, True) - - attentions = [t.numpy() for t in outputs[-1]] - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - - def test_hidden_states_output(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - config.output_hidden_states = True - config.output_attentions = False - model = model_class(config) - outputs = model(inputs_dict) - hidden_states = [t.numpy() for t in outputs[-1]] - self.assertEqual(model.config.output_attentions, False) - self.assertEqual(model.config.output_hidden_states, True) - self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) - self.assertListEqual( - list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size] - ) - - def test_model_common_attributes(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) - x = model.get_output_embeddings() - assert x is None or isinstance(x, tf.keras.layers.Layer) - - def test_determinism(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0] - out_1 = first.numpy() - out_2 = second.numpy() + # Make sure we don't have nans + out_1 = after_outputs[0].numpy() + out_2 = outputs[0].numpy() out_1 = out_1[~np.isnan(out_1)] out_2 = out_2[~np.isnan(out_2)] max_diff = np.amax(np.abs(out_1 - out_2)) self.assertLessEqual(max_diff, 1e-5) - def _get_embeds(self, wte, input_ids): - # ^^ In our TF models, the input_embeddings can take slightly different forms, - # so we try a few of them. - # We used to fall back to just synthetically creating a dummy tensor of ones: + def test_pt_tf_model_equivalence(self): + if not is_torch_available(): + return + + import torch + import transformers + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beggining + pt_model_class = getattr(transformers, pt_model_class_name) + + config.output_hidden_states = True + tf_model = model_class(config) + pt_model = pt_model_class(config) + + # Check we can load pt model in tf and vice-versa with model => model functions + tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict) + pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) + + # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences + pt_model.eval() + pt_inputs_dict = dict( + (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items() + ) + with torch.no_grad(): + pto = pt_model(**pt_inputs_dict) + tfo = tf_model(inputs_dict, training=False) + tf_hidden_states = tfo[0].numpy() + pt_hidden_states = pto[0].numpy() + tf_hidden_states[np.isnan(tf_hidden_states)] = 0 + pt_hidden_states[np.isnan(pt_hidden_states)] = 0 + max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states)) + self.assertLessEqual(max_diff, 2e-2) + + # Check we can load pt model in tf and vice-versa with checkpoint => model functions + with TemporaryDirectory() as tmpdirname: + pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") + torch.save(pt_model.state_dict(), pt_checkpoint_path) + tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) + + tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") + tf_model.save_weights(tf_checkpoint_path) + pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) + + # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences + pt_model.eval() + pt_inputs_dict = dict( + (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items() + ) + with torch.no_grad(): + pto = pt_model(**pt_inputs_dict) + tfo = tf_model(inputs_dict) + tfo = tfo[0].numpy() + pto = pto[0].numpy() + tfo[np.isnan(tfo)] = 0 + pto[np.isnan(pto)] = 0 + max_diff = np.amax(np.abs(tfo - pto)) + self.assertLessEqual(max_diff, 2e-2) + + def test_compile_tf_model(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + if self.is_encoder_decoder: + input_ids = { + "decoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"), + "encoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="encoder_input_ids", dtype="int32"), + } + else: + input_ids = tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32") + optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") + + for model_class in self.all_model_classes: + # Prepare our model + model = model_class(config) + + # Let's load it from the disk to be sure we can use pretrained weights + with TemporaryDirectory() as tmpdirname: + outputs = model(inputs_dict) # build the model + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname) + + outputs_dict = model(input_ids) + hidden_states = outputs_dict[0] + + # Add a dense layer on top to test intetgration with other keras modules + outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) + + # Compile extended model + extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs]) + extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) + + def test_keyword_and_dict_args(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + outputs_dict = model(inputs_dict) + + inputs_keywords = copy.deepcopy(inputs_dict) + input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None) + outputs_keywords = model(input_ids, **inputs_keywords) + + output_dict = outputs_dict[0].numpy() + output_keywords = outputs_keywords[0].numpy() + + self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + decoder_seq_length = ( + self.model_tester.decoder_seq_length + if hasattr(self.model_tester, "decoder_seq_length") + else self.model_tester.seq_length + ) + encoder_seq_length = ( + self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "encoder_seq_length") + else self.model_tester.seq_length + ) + decoder_key_length = ( + self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length + ) + encoder_key_length = ( + self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length + ) + + for model_class in self.all_model_classes: + config.output_attentions = True + config.output_hidden_states = False + model = model_class(config) + outputs = model(inputs_dict) + attentions = [t.numpy() for t in outputs[-1]] + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, False) + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + self.assertEqual(out_len % 2, 0) + decoder_attentions = outputs[(out_len // 2) - 1] + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, False) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # Check attention is always last and order is fine + config.output_attentions = True + config.output_hidden_states = True + model = model_class(config) + outputs = model(inputs_dict) + self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, True) + + attentions = [t.numpy() for t in outputs[-1]] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config.output_hidden_states = True + config.output_attentions = False + model = model_class(config) + outputs = model(inputs_dict) + hidden_states = [t.numpy() for t in outputs[-1]] + self.assertEqual(model.config.output_attentions, False) + self.assertEqual(model.config.output_hidden_states, True) + self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size] + ) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + x = model.get_output_embeddings() + assert x is None or isinstance(x, tf.keras.layers.Layer) + + def test_determinism(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0] + out_1 = first.numpy() + out_2 = second.numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def _get_embeds(self, wte, input_ids): + # ^^ In our TF models, the input_embeddings can take slightly different forms, + # so we try a few of them. + # We used to fall back to just synthetically creating a dummy tensor of ones: + try: + x = wte(input_ids, mode="embedding") + except Exception: try: - x = wte(input_ids, mode="embedding") + x = wte([input_ids], mode="embedding") except Exception: try: - x = wte([input_ids], mode="embedding") + x = wte([input_ids, None, None, None], mode="embedding") except Exception: - try: - x = wte([input_ids, None, None, None], mode="embedding") - except Exception: - if hasattr(self.model_tester, "embedding_size"): - x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32) - else: - x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32) - return x + if hasattr(self.model_tester, "embedding_size"): + x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32) + else: + x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32) + return x - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + if not self.is_encoder_decoder: + input_ids = inputs_dict["input_ids"] + del inputs_dict["input_ids"] + else: + encoder_input_ids = inputs_dict["encoder_input_ids"] + decoder_input_ids = inputs_dict["decoder_input_ids"] + del inputs_dict["encoder_input_ids"] + del inputs_dict["decoder_input_ids"] + + for model_class in self.all_model_classes: + model = model_class(config) + + wte = model.get_input_embeddings() if not self.is_encoder_decoder: - input_ids = inputs_dict["input_ids"] - del inputs_dict["input_ids"] + inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids) else: - encoder_input_ids = inputs_dict["encoder_input_ids"] - decoder_input_ids = inputs_dict["decoder_input_ids"] - del inputs_dict["encoder_input_ids"] - del inputs_dict["decoder_input_ids"] + inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids) + inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids) - for model_class in self.all_model_classes: - model = model_class(config) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids) - else: - inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids) - inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids) - - outputs = model(inputs_dict) + outputs = model(inputs_dict) def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): diff --git a/tests/test_modeling_tf_ctrl.py b/tests/test_modeling_tf_ctrl.py index 02845e60b5..4c41f417cc 100644 --- a/tests/test_modeling_tf_ctrl.py +++ b/tests/test_modeling_tf_ctrl.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import CTRLConfig, is_tf_available from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFCommonTestCases, ids_tensor +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_tf, slow @@ -26,7 +28,7 @@ if is_tf_available(): @require_tf -class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester): +class TFCTRLModelTest(TFModelTesterMixin, unittest.TestCase): all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else () diff --git a/tests/test_modeling_tf_distilbert.py b/tests/test_modeling_tf_distilbert.py index 3fa1ea849b..7dd0ad23e2 100644 --- a/tests/test_modeling_tf_distilbert.py +++ b/tests/test_modeling_tf_distilbert.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import DistilBertConfig, is_tf_available from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFCommonTestCases, ids_tensor +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor from .utils import require_tf @@ -31,7 +33,7 @@ if is_tf_available(): @require_tf -class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester): +class TFDistilBertModelTest(TFModelTesterMixin, unittest.TestCase): all_model_classes = ( ( diff --git a/tests/test_modeling_tf_gpt2.py b/tests/test_modeling_tf_gpt2.py index e97462258e..8c718a34f6 100644 --- a/tests/test_modeling_tf_gpt2.py +++ b/tests/test_modeling_tf_gpt2.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import GPT2Config, is_tf_available from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFCommonTestCases, ids_tensor +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_tf, slow @@ -32,7 +34,7 @@ if is_tf_available(): @require_tf -class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): +class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase): all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel) if is_tf_available() else () # all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel) if is_tf_available() else () diff --git a/tests/test_modeling_tf_openai_gpt.py b/tests/test_modeling_tf_openai_gpt.py index ea463c0e4f..3a624f1252 100644 --- a/tests/test_modeling_tf_openai_gpt.py +++ b/tests/test_modeling_tf_openai_gpt.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import OpenAIGPTConfig, is_tf_available from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFCommonTestCases, ids_tensor +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_tf, slow @@ -32,7 +34,7 @@ if is_tf_available(): @require_tf -class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): +class TFOpenAIGPTModelTest(TFModelTesterMixin, unittest.TestCase): all_model_classes = ( (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else () diff --git a/tests/test_modeling_tf_roberta.py b/tests/test_modeling_tf_roberta.py index 6318eeaabf..d924c5f5d7 100644 --- a/tests/test_modeling_tf_roberta.py +++ b/tests/test_modeling_tf_roberta.py @@ -19,7 +19,7 @@ import unittest from transformers import RobertaConfig, is_tf_available from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFCommonTestCases, ids_tensor +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_tf, slow @@ -36,7 +36,7 @@ if is_tf_available(): @require_tf -class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): +class TFRobertaModelTest(TFModelTesterMixin, unittest.TestCase): all_model_classes = ( (TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification) if is_tf_available() else () diff --git a/tests/test_modeling_tf_t5.py b/tests/test_modeling_tf_t5.py index e18213c0b5..69272addda 100644 --- a/tests/test_modeling_tf_t5.py +++ b/tests/test_modeling_tf_t5.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import T5Config, is_tf_available from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFCommonTestCases, ids_tensor +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_tf, slow @@ -26,7 +28,7 @@ if is_tf_available(): @require_tf -class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): +class TFT5ModelTest(TFModelTesterMixin, unittest.TestCase): is_encoder_decoder = True all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else () diff --git a/tests/test_modeling_tf_transfo_xl.py b/tests/test_modeling_tf_transfo_xl.py index bc2d616fe2..62f81c45a4 100644 --- a/tests/test_modeling_tf_transfo_xl.py +++ b/tests/test_modeling_tf_transfo_xl.py @@ -15,11 +15,12 @@ from __future__ import absolute_import, division, print_function import random +import unittest from transformers import TransfoXLConfig, is_tf_available from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFCommonTestCases, ids_tensor +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_tf, slow @@ -33,7 +34,7 @@ if is_tf_available(): @require_tf -class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): +class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase): all_model_classes = (TFTransfoXLModel, TFTransfoXLLMHeadModel) if is_tf_available() else () test_pruning = False diff --git a/tests/test_modeling_tf_xlm.py b/tests/test_modeling_tf_xlm.py index 0ee924ab61..e62bae66c8 100644 --- a/tests/test_modeling_tf_xlm.py +++ b/tests/test_modeling_tf_xlm.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import is_tf_available from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFCommonTestCases, ids_tensor +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_tf, slow @@ -34,7 +36,7 @@ if is_tf_available(): @require_tf -class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): +class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase): all_model_classes = ( (TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple) diff --git a/tests/test_modeling_tf_xlnet.py b/tests/test_modeling_tf_xlnet.py index 0dcd8a776a..7b7570f889 100644 --- a/tests/test_modeling_tf_xlnet.py +++ b/tests/test_modeling_tf_xlnet.py @@ -15,11 +15,12 @@ from __future__ import absolute_import, division, print_function import random +import unittest from transformers import XLNetConfig, is_tf_available from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFCommonTestCases, ids_tensor +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_tf, slow @@ -37,7 +38,7 @@ if is_tf_available(): @require_tf -class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): +class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase): all_model_classes = ( ( diff --git a/tests/test_modeling_transfo_xl.py b/tests/test_modeling_transfo_xl.py index 76886a83ad..1e18aeb09b 100644 --- a/tests/test_modeling_transfo_xl.py +++ b/tests/test_modeling_transfo_xl.py @@ -15,11 +15,12 @@ from __future__ import absolute_import, division, print_function import random +import unittest from transformers import is_torch_available from .test_configuration_common import ConfigTester -from .test_modeling_common import CommonTestCases, ids_tensor +from .test_modeling_common import ModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -30,7 +31,7 @@ if is_torch_available(): @require_torch -class TransfoXLModelTest(CommonTestCases.CommonModelTester): +class TransfoXLModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) if is_torch_available() else () test_pruning = False diff --git a/tests/test_modeling_xlm.py b/tests/test_modeling_xlm.py index c921a4312f..22897249aa 100644 --- a/tests/test_modeling_xlm.py +++ b/tests/test_modeling_xlm.py @@ -14,10 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import unittest + from transformers import is_torch_available from .test_configuration_common import ConfigTester -from .test_modeling_common import CommonTestCases, ids_tensor +from .test_modeling_common import ModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -34,7 +36,7 @@ if is_torch_available(): @require_torch -class XLMModelTest(CommonTestCases.CommonModelTester): +class XLMModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( ( diff --git a/tests/test_modeling_xlnet.py b/tests/test_modeling_xlnet.py index c295b79b28..e095a31e30 100644 --- a/tests/test_modeling_xlnet.py +++ b/tests/test_modeling_xlnet.py @@ -15,11 +15,12 @@ from __future__ import absolute_import, division, print_function import random +import unittest from transformers import is_torch_available from .test_configuration_common import ConfigTester -from .test_modeling_common import CommonTestCases, ids_tensor +from .test_modeling_common import ModelTesterMixin, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -38,7 +39,7 @@ if is_torch_available(): @require_torch -class XLNetModelTest(CommonTestCases.CommonModelTester): +class XLNetModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( (