Return correct Bart hidden state tensors (#8747)

* bart output hidden states upstream * same w/ decoder * add tests * fix prophetnet * fix gpt2 and ctrl * fix fstm and skip test for reformer and longformer * fix all models Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-11-25 16:06:04 -05:00
parent 138f45c184
commit 369f1d77b4
14 changed files with 199 additions and 54 deletions
--- a/tests/test_modeling_prophetnet.py
+++ b/tests/test_modeling_prophetnet.py
@@ -1011,6 +1011,32 @@ class ProphetNetModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )

+    def test_retain_grad_hidden_states_attentions(self):
+        # decoder cannot keep gradients
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.output_hidden_states = True
+        config.output_attentions = True
+
+        # no need to test all models as different heads yield the same functionality
+        model_class = self.all_model_classes[0]
+        model = model_class(config)
+        model.to(torch_device)
+
+        inputs = self._prepare_for_class(inputs_dict, model_class)
+
+        outputs = model(**inputs)
+        output = outputs[0]
+
+        encoder_hidden_states = outputs.encoder_hidden_states[0]
+        encoder_attentions = outputs.encoder_attentions[0]
+        encoder_hidden_states.retain_grad()
+        encoder_attentions.retain_grad()
+
+        output.flatten()[0].backward(retain_graph=True)
+
+        self.assertIsNotNone(encoder_hidden_states.grad)
+        self.assertIsNotNone(encoder_attentions.grad)
+

@require_torch
 class ProphetNetStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
@@ -1037,6 +1063,10 @@ class ProphetNetStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMix
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs)

+    def test_retain_grad_hidden_states_attentions(self):
+        # decoder cannot keep gradients
+        return
+

@require_torch
 class ProphetNetStandaloneEncoderModelTest(ModelTesterMixin, unittest.TestCase):