Add head_mask/decoder_head_mask for BART (#9569)

* Add head_mask/decoder_head_mask for BART This branch implement head_mask and decoder_head_mask for BART-based models. Full list below: - BART - MBart - Blenderbot - BlenderbotSmall - Marian - Pegasus Everything is accompanied with updated testing. * Fix test_headmasking for BART models * Fix text_headmasking for BART-like models which has only 2 layers in each modules. The condition ``` self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) ``` is, therefore, invalid for encoder-decoder models considering the `head_mask` ``` head_mask = torch.ones( self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device, ) head_mask[0, 0] = 0 head_mask[-1, :-1] = 0 ``` specified in the `test_headmasking` test/function. * Adjust test_modeling_common.py to reflect T5 input args * Update tests/test_modeling_common.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * make style * make fix-copies Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-01-18 13:35:22 +01:00
parent 65eb5d9ac5
commit 357fb1c5d8
13 changed files with 735 additions and 60 deletions
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -204,9 +204,13 @@ class ModelTesterMixin:
                    "attention_mask",
                    "decoder_input_ids",
                    "decoder_attention_mask",
-                    "encoder_outputs",
                ]
-                self.assertListEqual(arg_names[:5], expected_arg_names)
+                expected_arg_names.extend(
+                    ["head_mask", "decoder_head_mask", "encoder_outputs"]
+                    if "head_mask" and "decoder_head_mask" in arg_names
+                    else ["encoder_outputs"]
+                )
+                self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
            else:
                expected_arg_names = ["input_ids"]
                self.assertListEqual(arg_names[:1], expected_arg_names)
@@ -395,7 +399,6 @@ class ModelTesterMixin:
                    attention_mask = inputs["attention_mask"]
                    decoder_input_ids = inputs["decoder_input_ids"]
                    decoder_attention_mask = inputs["decoder_attention_mask"]
-
                    traced_model = torch.jit.trace(
                        model, (input_ids, attention_mask, decoder_input_ids, decoder_attention_mask)
                    )
@@ -465,6 +468,11 @@ class ModelTesterMixin:
            head_mask.requires_grad_(requires_grad=True)
            inputs = self._prepare_for_class(inputs_dict, model_class).copy()
            inputs["head_mask"] = head_mask
+            if model.config.is_encoder_decoder:
+                signature = inspect.signature(model.forward)
+                arg_names = [*signature.parameters.keys()]
+                if "decoder_head_mask" in arg_names:  # necessary diferentiation because of T5 model
+                    inputs["decoder_head_mask"] = head_mask

            outputs = model(**inputs, return_dict=True)

@@ -474,24 +482,31 @@ class ModelTesterMixin:
            output.backward()
            multihead_outputs = head_mask.grad

-            attentions = outputs[-1]
-
-            # Remove Nan
-            for t in attentions:
-                self.assertLess(
-                    torch.sum(torch.isnan(t)), t.numel() / 4
-                )  # Check we don't have more than 25% nans (arbitrary)
-            attentions = [
-                t.masked_fill(torch.isnan(t), 0.0) for t in attentions
-            ]  # remove them (the test is less complete)
-
            self.assertIsNotNone(multihead_outputs)
            self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
-            self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
-            self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
-            self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
-            self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
-            self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
+
+            def check_attentions_validity(attentions):
+                # Remove Nan
+                for t in attentions:
+                    self.assertLess(
+                        torch.sum(torch.isnan(t)), t.numel() / 4
+                    )  # Check we don't have more than 25% nans (arbitrary)
+                attentions = [
+                    t.masked_fill(torch.isnan(t), 0.0) for t in attentions
+                ]  # remove them (the test is less complete)
+
+                self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
+                self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
+                if len(attentions) > 2:  # encoder-decoder models have only 2 layers in each module
+                    self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
+                self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
+                self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
+
+            if model.config.is_encoder_decoder:
+                check_attentions_validity(outputs.encoder_attentions)
+                check_attentions_validity(outputs.decoder_attentions)
+            else:
+                check_attentions_validity(outputs.attentions)

    def test_head_pruning(self):
        if not self.test_pruning: