From 61cf2ea9c0c3f76c2a06a36844506153849e81b0 Mon Sep 17 00:00:00 2001
From: Matt <Rocketknight1@users.noreply.github.com>
Date: Thu, 7 Oct 2021 17:30:15 +0100
Subject: [PATCH] Fix incorrect output shapes for TF/PT LED (#13882)

* Fix issues with LED model

* Style pass

* Bugfixes

* Correct attentions as well (unpad encoder attentions to the input length)

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 src/transformers/models/led/modeling_led.py   |  5 ++++
 .../models/led/modeling_tf_led.py             | 10 +++++--
 tests/test_modeling_led.py                    | 30 +------------------
 3 files changed, 13 insertions(+), 32 deletions(-)

diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py
index 863e96c053..1e08899c46 100755
--- a/src/transformers/models/led/modeling_led.py
+++ b/src/transformers/models/led/modeling_led.py
@@ -1858,6 +1858,11 @@ class LEDEncoder(LEDPreTrainedModel):
         if padding_len > 0:
             # unpad `hidden_states` because the calling function is expecting a length == input_ids.size(1)
             hidden_states = hidden_states[:, :-padding_len]
+            if output_hidden_states:
+                encoder_states = tuple([state[:, :-padding_len] for state in encoder_states])
+
+            if output_attentions:
+                all_attentions = tuple([state[:, :, :-padding_len, :] for state in all_attentions])
 
         if not return_dict:
             return tuple(
diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py
index 117738485f..d0d8afb1a0 100644
--- a/src/transformers/models/led/modeling_tf_led.py
+++ b/src/transformers/models/led/modeling_tf_led.py
@@ -1602,7 +1602,9 @@ class TFLEDEncoder(tf.keras.layers.Layer):
         super().__init__(**kwargs)
         self.config = config
         self.dropout = tf.keras.layers.Dropout(config.dropout)
-        self.layerdrop = config.encoder_layerdrop
+        if config.encoder_layerdrop > 0:
+            logger.warning("Layerdrop is currently disabled in TFLED models.")
+        self.layerdrop = 0.0
         self.padding_idx = config.pad_token_id
 
         if isinstance(config.attention_window, int):
@@ -1867,7 +1869,9 @@ class TFLEDDecoder(tf.keras.layers.Layer):
         self.config = config
         self.padding_idx = config.pad_token_id
         self.embed_tokens = embed_tokens
-        self.layerdrop = config.decoder_layerdrop
+        if config.decoder_layerdrop > 0:
+            logger.warning("Layerdrop is currently disabled in TFLED models.")
+        self.layerdrop = 0.0
         self.embed_positions = TFLEDLearnedPositionalEmbedding(
             config.max_decoder_position_embeddings,
             config.d_model,
@@ -2451,7 +2455,7 @@ class TFLEDForConditionalGeneration(TFLEDPreTrainedModel):
             past_key_values=outputs.past_key_values,  # index 1 of d outputs
             decoder_hidden_states=outputs.decoder_hidden_states,  # index 2 of d outputs
             decoder_attentions=outputs.decoder_attentions,  # index 3 of d outputs
-            encoder_last_hidden_state=outputs.last_hidden_state,  # index 0 of encoder outputs
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,  # index 0 of encoder outputs
             encoder_hidden_states=outputs.encoder_hidden_states,  # 1 of e out
             encoder_attentions=outputs.encoder_attentions,  # 2 of e out
             encoder_global_attentions=outputs.encoder_global_attentions,
diff --git a/tests/test_modeling_led.py b/tests/test_modeling_led.py
index cb0861acdb..db38604f00 100644
--- a/tests/test_modeling_led.py
+++ b/tests/test_modeling_led.py
@@ -126,9 +126,7 @@ class LEDModelTester:
 
         # because of padding `encoder_seq_length`, is different from `seq_length`. Relevant for
         # the `test_attention_outputs` and `test_hidden_states_output` tests
-        self.encoder_seq_length = (
-            self.seq_length + (self.attention_window - self.seq_length % self.attention_window) % self.attention_window
-        )
+        self.encoder_seq_length = self.seq_length
 
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -354,32 +352,6 @@ class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
         # longformer cannot keep gradients in attentions or hidden states
         return
 
-    def _check_encoder_attention_for_generate(self, attentions, batch_size, config, seq_length):
-        # make sure tgt_length is padded
-        tgt_length = (
-            seq_length // config.attention_window[0] + (seq_length % config.attention_window[0] != 0)
-        ) * config.attention_window[0]
-
-        encoder_expected_shape = (batch_size, config.num_attention_heads, tgt_length, seq_length)
-        self.assertIsInstance(attentions, tuple)
-        self.assertListEqual(
-            [layer_attentions.shape for layer_attentions in attentions],
-            [encoder_expected_shape] * len(attentions),
-        )
-
-    def _check_encoder_hidden_states_for_generate(self, hidden_states, batch_size, config, seq_length):
-        # make sure seq_length is padded
-        seq_length = (
-            seq_length // config.attention_window[0] + (seq_length % config.attention_window[0] != 0)
-        ) * config.attention_window[0]
-
-        encoder_expected_shape = (batch_size, seq_length, config.hidden_size)
-        self.assertIsInstance(hidden_states, tuple)
-        self.assertListEqual(
-            [layer_hidden_states.shape for layer_hidden_states in hidden_states],
-            [encoder_expected_shape] * len(hidden_states),
-        )
-
     def test_attention_outputs(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True