Fix cross-attention head mask for Torch encoder-decoder models (#10605)

* Fix cross-attention head mask for Torch BART models * Fix head masking for cross-attention module for the following models: BART, Blenderbot, Blenderbot_small, M2M_100, Marian, MBart, Pegasus * Enable test_headmasking for M2M_100 model * Fix cross_head_mask for FSMT, LED and T5 * This commit fixes `head_mask` for cross-attention modules in the following models: FSMT, LED, T5 * It also contains some smaller changes in doc so that it is perfectly clear that the shape of `cross_head_mask` is the same as that of `decoder_head_mask` * Update template * Fix template for BartForCausalLM * Fix cross_head_mask for Speech2Text models * Fix cross_head_mask in templates * Fix args order in BartForCausalLM template * Fix doc in BART templates * Make more explicit naming * `cross_head_mask` -> `cross_attn_head_mask` * `cross_layer_head_mask` -> `cross_attn_layer_head_mask` * Fix doc * make style quality * Fix speech2text docstring
2021-04-23 18:58:06 +02:00
parent ca6b80cadb
commit e3ff165aa5
23 changed files with 587 additions and 389 deletions
--- a/tests/test_modeling_led.py
+++ b/tests/test_modeling_led.py
@@ -52,6 +52,7 @@ def prepare_led_inputs_dict(
    decoder_attention_mask=None,
    head_mask=None,
    decoder_head_mask=None,
+    cross_attn_head_mask=None,
 ):
    if attention_mask is None:
        attention_mask = input_ids.ne(config.pad_token_id)
@@ -61,6 +62,8 @@ def prepare_led_inputs_dict(
        head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device)
    if decoder_head_mask is None:
        decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device)
+    if cross_attn_head_mask is None:
+        cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device)
    return {
        "input_ids": input_ids,
        "decoder_input_ids": decoder_input_ids,
@@ -68,6 +71,7 @@ def prepare_led_inputs_dict(
        "decoder_attention_mask": decoder_attention_mask,
        "head_mask": head_mask,
        "decoder_head_mask": decoder_head_mask,
+        "cross_attn_head_mask": cross_attn_head_mask,
    }