Add head_mask/decoder_head_mask for BART (#9569)

* Add head_mask/decoder_head_mask for BART This branch implement head_mask and decoder_head_mask for BART-based models. Full list below: - BART - MBart - Blenderbot - BlenderbotSmall - Marian - Pegasus Everything is accompanied with updated testing. * Fix test_headmasking for BART models * Fix text_headmasking for BART-like models which has only 2 layers in each modules. The condition ``` self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) ``` is, therefore, invalid for encoder-decoder models considering the `head_mask` ``` head_mask = torch.ones( self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device, ) head_mask[0, 0] = 0 head_mask[-1, :-1] = 0 ``` specified in the `test_headmasking` test/function. * Adjust test_modeling_common.py to reflect T5 input args * Update tests/test_modeling_common.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * make style * make fix-copies Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-01-18 13:35:22 +01:00
parent 65eb5d9ac5
commit 357fb1c5d8
13 changed files with 735 additions and 60 deletions
--- a/tests/test_modeling_blenderbot.py
+++ b/tests/test_modeling_blenderbot.py
@@ -40,16 +40,24 @@ def prepare_blenderbot_inputs_dict(
    decoder_input_ids,
    attention_mask=None,
    decoder_attention_mask=None,
+    head_mask=None,
+    decoder_head_mask=None,
 ):
    if attention_mask is None:
        attention_mask = input_ids.ne(config.pad_token_id)
    if decoder_attention_mask is None:
        decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
+    if head_mask is None:
+        head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads)
+    if decoder_head_mask is None:
+        decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads)
    return {
        "input_ids": input_ids,
        "decoder_input_ids": decoder_input_ids,
        "attention_mask": attention_mask,
        "decoder_attention_mask": attention_mask,
+        "head_mask": head_mask,
+        "decoder_head_mask": decoder_head_mask,
    }


@@ -129,9 +137,10 @@ class BlenderbotModelTester:
        model = BlenderbotModel(config=config).get_decoder().to(torch_device).eval()
        input_ids = inputs_dict["input_ids"]
        attention_mask = inputs_dict["attention_mask"]
+        head_mask = inputs_dict["head_mask"]

        # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
+        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)

        output, past_key_values = outputs.to_tuple()

@@ -197,7 +206,7 @@ class BlenderbotModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
    all_generative_model_classes = (BlenderbotForConditionalGeneration,) if is_torch_available() else ()
    is_encoder_decoder = True
    test_pruning = False
-    test_head_masking = False
+    test_head_masking = True
    test_missing_keys = False

    def setUp(self):