Switch return_dict to True by default. (#8530)

* Use the CI to identify failing tests
* Remove from all examples and tests
* More default switch
* Fixes
* More test fixes
* More fixes
* Last fixes hopefully
* Use the CI to identify failing tests
* Remove from all examples and tests
* More default switch
* Fixes
* More test fixes
* More fixes
* Last fixes hopefully
* Run on the real suite
* Fix slow tests
2020-11-16 11:43:00 -05:00
parent 0d0a0785fd
commit 1073a2bde5
106 changed files with 138 additions and 234 deletions
--- a/tests/test_generation_utils.py
+++ b/tests/test_generation_utils.py
@@ -118,7 +118,7 @@ class GenerationTesterMixin:
    @staticmethod
    def _get_encoder_outputs(model, input_ids, attention_mask, num_interleave=1):
        encoder = model.get_encoder()
-        encoder_outputs = encoder(input_ids, attention_mask=attention_mask, return_dict=True)
+        encoder_outputs = encoder(input_ids, attention_mask=attention_mask)
        encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.repeat_interleave(
            num_interleave, dim=0
        )
@@ -344,6 +344,7 @@ class GenerationTesterMixin:
    def test_beam_sample_generate(self):
        for model_class in self.all_generative_model_classes:
            config, input_ids, attention_mask, max_length = self._get_input_ids_and_config()
+            print("Return dict", config.return_dict)
            logits_warper_kwargs, logits_warper = self._get_warper_and_kwargs(num_beams=1)

            model = model_class(config).to(torch_device)
--- a/tests/test_modeling_albert.py
+++ b/tests/test_modeling_albert.py
@@ -102,7 +102,6 @@ class AlbertModelTester:
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
            num_hidden_groups=self.num_hidden_groups,
-            return_dict=True,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -259,7 +259,6 @@ class BartHeadTests(unittest.TestCase):
            eos_token_id=2,
            pad_token_id=1,
            bos_token_id=0,
-            return_dict=True,
        )
        return config, input_ids, batch_size

@@ -310,7 +309,6 @@ class BartHeadTests(unittest.TestCase):
            encoder_ffn_dim=8,
            decoder_ffn_dim=8,
            max_position_embeddings=48,
-            return_dict=True,
        )
        lm_model = BartForConditionalGeneration(config).to(torch_device)
        context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device)
@@ -713,6 +711,6 @@ class FastIntegrationTests(unittest.TestCase):
            padding="longest",
            truncation=True,
        )
-        features = self.xsum_1_1_model.get_encoder()(**batch, return_dict=True).last_hidden_state
+        features = self.xsum_1_1_model.get_encoder()(**batch).last_hidden_state
        expected = [[-0.0828, -0.0251, -0.0674], [0.1277, 0.3311, -0.0255], [0.2613, -0.0840, -0.2763]]
        assert_tensors_close(features[0, :3, :3], torch.tensor(expected), atol=1e-3)
--- a/tests/test_modeling_bert.py
+++ b/tests/test_modeling_bert.py
@@ -124,7 +124,6 @@ class BertModelTester:
            type_vocab_size=self.type_vocab_size,
            is_decoder=False,
            initializer_range=self.initializer_range,
-            return_dict=True,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_bert_generation.py
+++ b/tests/test_modeling_bert_generation.py
@@ -89,7 +89,6 @@ class BertGenerationEncoderTester:
            max_position_embeddings=self.max_position_embeddings,
            is_decoder=False,
            initializer_range=self.initializer_range,
-            return_dict=True,
        )

        return config, input_ids, input_mask, token_labels
--- a/tests/test_modeling_camembert.py
+++ b/tests/test_modeling_camembert.py
@@ -31,7 +31,7 @@ if is_torch_available():
 class CamembertModelIntegrationTest(unittest.TestCase):
    @slow
    def test_output_embeds_base_model(self):
-        model = CamembertModel.from_pretrained("camembert-base", return_dict=True)
+        model = CamembertModel.from_pretrained("camembert-base")
        model.to(torch_device)

        input_ids = torch.tensor(
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -657,7 +657,7 @@ class ModelTesterMixin:
            model.eval()

            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class), return_dict=True)
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            hidden_states = outputs["hidden_states"] if "hidden_states" in outputs else outputs[-1]

            expected_num_layers = getattr(
--- a/tests/test_modeling_ctrl.py
+++ b/tests/test_modeling_ctrl.py
@@ -94,7 +94,6 @@ class CTRLModelTester:
            n_ctx=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range,
-            return_dict=True,
        )

        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
--- a/tests/test_modeling_deberta.py
+++ b/tests/test_modeling_deberta.py
@@ -148,7 +148,7 @@ class DebertaModelTest(ModelTesterMixin, unittest.TestCase):
            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
+            self.parent.assertListEqual(list(result.loss.size()), [])

        def create_and_check_deberta_model(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -160,11 +160,8 @@ class DebertaModelTest(ModelTesterMixin, unittest.TestCase):
            sequence_output = model(input_ids, token_type_ids=token_type_ids)[0]
            sequence_output = model(input_ids)[0]

-            result = {
-                "sequence_output": sequence_output,
-            }
            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+                list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size]
            )

        def create_and_check_deberta_for_sequence_classification(
@@ -174,14 +171,8 @@ class DebertaModelTest(ModelTesterMixin, unittest.TestCase):
            model = DebertaForSequenceClassification(config)
            model.to(torch_device)
            model.eval()
-            loss, logits = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
+            result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
+            self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels])
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
--- a/tests/test_modeling_distilbert.py
+++ b/tests/test_modeling_distilbert.py
@@ -110,7 +110,6 @@ if is_torch_available():
                attention_dropout=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                initializer_range=self.initializer_range,
-                return_dict=True,
            )

            return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_dpr.py
+++ b/tests/test_modeling_dpr.py
@@ -117,7 +117,6 @@ class DPRModelTester:
            type_vocab_size=self.type_vocab_size,
            is_decoder=False,
            initializer_range=self.initializer_range,
-            return_dict=True,
        )
        config = DPRConfig(projection_dim=self.projection_dim, **config.to_dict())

--- a/tests/test_modeling_electra.py
+++ b/tests/test_modeling_electra.py
@@ -101,7 +101,6 @@ class ElectraModelTester:
            type_vocab_size=self.type_vocab_size,
            is_decoder=False,
            initializer_range=self.initializer_range,
-            return_dict=True,
        )

        return (
--- a/tests/test_modeling_encoder_decoder.py
+++ b/tests/test_modeling_encoder_decoder.py
@@ -85,7 +85,6 @@ class EncoderDecoderMixin:
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
-            return_dict=True,
        )

        self.assertEqual(
@@ -117,7 +116,6 @@ class EncoderDecoderMixin:
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
-            return_dict=True,
        )
        self.assertEqual(
            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
@@ -132,7 +130,6 @@ class EncoderDecoderMixin:
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
-            return_dict=True,
        )

        self.assertEqual(
@@ -278,7 +275,6 @@ class EncoderDecoderMixin:
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
-            return_dict=True,
        )

        loss = outputs_encoder_decoder["loss"]
@@ -313,7 +309,6 @@ class EncoderDecoderMixin:
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            output_attentions=True,
-            return_dict=True,
        )

        encoder_attentions = outputs_encoder_decoder["encoder_attentions"]
--- a/tests/test_modeling_flaubert.py
+++ b/tests/test_modeling_flaubert.py
@@ -113,7 +113,6 @@ class FlaubertModelTester(object):
            initializer_range=self.initializer_range,
            summary_type=self.summary_type,
            use_proj=self.use_proj,
-            return_dict=True,
        )

        return (
--- a/tests/test_modeling_flax_bert.py
+++ b/tests/test_modeling_flax_bert.py
@@ -29,7 +29,7 @@ class FlaxBertModelTest(unittest.TestCase):
                # Check for simple input
                pt_inputs = tokenizer.encode_plus("This is a simple input", return_tensors=TensorType.PYTORCH)
                fx_inputs = tokenizer.encode_plus("This is a simple input", return_tensors=TensorType.JAX)
-                pt_outputs = pt_model(**pt_inputs)
+                pt_outputs = pt_model(**pt_inputs).to_tuple()
                fx_outputs = fx_model(**fx_inputs)

                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
--- a/tests/test_modeling_flax_roberta.py
+++ b/tests/test_modeling_flax_roberta.py
@@ -34,7 +34,7 @@ class FlaxRobertaModelTest(unittest.TestCase):

                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")

-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
+                for fx_output, pt_output in zip(fx_outputs, pt_outputs.to_tuple()):
                    self.assert_almost_equals(fx_output, pt_output.numpy(), 5e-4)

    def assert_almost_equals(self, a: ndarray, b: ndarray, tol: float):
--- a/tests/test_modeling_fsmt.py
+++ b/tests/test_modeling_fsmt.py
@@ -259,7 +259,6 @@ class FSMTHeadTests(unittest.TestCase):
            eos_token_id=2,
            pad_token_id=1,
            bos_token_id=0,
-            return_dict=True,
        )

    def _get_config_and_data(self):
--- a/tests/test_modeling_funnel.py
+++ b/tests/test_modeling_funnel.py
@@ -140,7 +140,6 @@ class FunnelModelTester:
            activation_dropout=self.activation_dropout,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
-            return_dict=True,
        )

        return (
--- a/tests/test_modeling_gpt2.py
+++ b/tests/test_modeling_gpt2.py
@@ -131,7 +131,6 @@ class GPT2ModelTester:
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
-            return_dict=True,
            gradient_checkpointing=gradient_checkpointing,
        )

--- a/tests/test_modeling_layoutlm.py
+++ b/tests/test_modeling_layoutlm.py
@@ -125,7 +125,6 @@ class LayoutLMModelTester:
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
-            return_dict=True,
        )

        return config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_longformer.py
+++ b/tests/test_modeling_longformer.py
@@ -113,7 +113,6 @@ class LongformerModelTester:
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
            attention_window=self.attention_window,
-            return_dict=True,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_lxmert.py
+++ b/tests/test_modeling_lxmert.py
@@ -282,7 +282,6 @@ class LxmertModelTester:
            attention_mask=input_mask,
            labels=ans,
            output_attentions=output_attentions,
-            return_dict=True,
        )
        result = model(input_ids, visual_feats, bounding_boxes, labels=ans)
        result = model(
@@ -302,7 +301,6 @@ class LxmertModelTester:
            attention_mask=input_mask,
            labels=ans,
            output_attentions=not output_attentions,
-            return_dict=True,
        )

        self.parent.assertEqual(result.question_answering_score.shape, (self.batch_size, self.num_qa_labels))
@@ -335,7 +333,6 @@ class LxmertModelTester:
            matched_label=matched_label,
            ans=ans,
            output_attentions=output_attentions,
-            return_dict=True,
        )
        result = model(
            input_ids,
@@ -390,7 +387,6 @@ class LxmertModelTester:
            matched_label=matched_label,
            ans=ans,
            output_attentions=not output_attentions,
-            return_dict=True,
        )

        self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
@@ -427,7 +423,6 @@ class LxmertModelTester:
            token_type_ids=token_type_ids,
            attention_mask=input_mask,
            ans=ans,
-            return_dict=True,
        )

        result_qa = model_qa(
@@ -437,7 +432,6 @@ class LxmertModelTester:
            labels=ans,
            token_type_ids=token_type_ids,
            attention_mask=input_mask,
-            return_dict=True,
        )

        model_pretrain.resize_num_qa_labels(num_small_labels)
@@ -450,7 +444,6 @@ class LxmertModelTester:
            token_type_ids=token_type_ids,
            attention_mask=input_mask,
            ans=less_labels_ans,
-            return_dict=True,
        )

        result_qa_less = model_qa(
@@ -460,7 +453,6 @@ class LxmertModelTester:
            labels=less_labels_ans,
            token_type_ids=token_type_ids,
            attention_mask=input_mask,
-            return_dict=True,
        )

        model_pretrain.resize_num_qa_labels(num_large_labels)
@@ -473,7 +465,6 @@ class LxmertModelTester:
            token_type_ids=token_type_ids,
            attention_mask=input_mask,
            ans=more_labels_ans,
-            return_dict=True,
        )

        result_qa_more = model_qa(
@@ -483,7 +474,6 @@ class LxmertModelTester:
            labels=more_labels_ans,
            token_type_ids=token_type_ids,
            attention_mask=input_mask,
-            return_dict=True,
        )

        model_qa_labels = model_qa.num_qa_labels
--- a/tests/test_modeling_marian.py
+++ b/tests/test_modeling_marian.py
@@ -50,7 +50,6 @@ class ModelTester:
            decoder_ffn_dim=32,
            max_position_embeddings=48,
            add_final_layer_norm=True,
-            return_dict=True,
        )

    def prepare_config_and_inputs_for_common(self):
--- a/tests/test_modeling_mbart.py
+++ b/tests/test_modeling_mbart.py
@@ -37,7 +37,6 @@ class ModelTester:
            decoder_ffn_dim=32,
            max_position_embeddings=48,
            add_final_layer_norm=True,
-            return_dict=True,
        )

    def prepare_config_and_inputs_for_common(self):
@@ -132,7 +131,6 @@ class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest):
            decoder_ffn_dim=32,
            max_position_embeddings=48,
            add_final_layer_norm=True,
-            return_dict=True,
        )
        lm_model = MBartForConditionalGeneration(config).to(torch_device)
        context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device)
--- a/tests/test_modeling_mobilebert.py
+++ b/tests/test_modeling_mobilebert.py
@@ -124,7 +124,6 @@ class MobileBertModelTester:
            type_vocab_size=self.type_vocab_size,
            is_decoder=False,
            initializer_range=self.initializer_range,
-            return_dict=True,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_openai.py
+++ b/tests/test_modeling_openai.py
@@ -94,7 +94,6 @@ class OpenAIGPTModelTester:
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range
            pad_token_id=self.pad_token_id,
-            return_dict=True,
        )

        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
--- a/tests/test_modeling_pegasus.py
+++ b/tests/test_modeling_pegasus.py
@@ -33,7 +33,6 @@ class ModelTester:
            decoder_ffn_dim=32,
            max_position_embeddings=48,
            add_final_layer_norm=True,
-            return_dict=True,
        )

    def prepare_config_and_inputs_for_common(self):
--- a/tests/test_modeling_prophetnet.py
+++ b/tests/test_modeling_prophetnet.py
@@ -142,7 +142,6 @@ class ProphetNetModelTester:
            disable_ngram_loss=self.disable_ngram_loss,
            max_position_embeddings=self.max_position_embeddings,
            is_encoder_decoder=self.is_encoder_decoder,
-            return_dict=True,
        )

        return (
@@ -344,7 +343,6 @@ class ProphetNetModelTester:
                decoder_input_ids=decoder_input_ids,
                attention_mask=attention_mask,
                decoder_attention_mask=decoder_attention_mask,
-                return_dict=True,
            )

            tied_model_result = tied_model(
@@ -352,7 +350,6 @@ class ProphetNetModelTester:
                decoder_input_ids=decoder_input_ids,
                attention_mask=attention_mask,
                decoder_attention_mask=decoder_attention_mask,
-                return_dict=True,
            )

            # check that models has less parameters
@@ -419,7 +416,6 @@ class ProphetNetModelTester:
                attention_mask=attention_mask,
                decoder_attention_mask=decoder_attention_mask,
                labels=lm_labels,
-                return_dict=True,
            )
        self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(128.2925, device=torch_device), atol=1e-3))

@@ -433,9 +429,7 @@ class ProphetNetModelTester:
        model.to(torch_device)
        model.eval()

-        outputs_no_mask = model(
-            input_ids=input_ids[:, :5], decoder_input_ids=decoder_input_ids[:, :5], return_dict=True
-        )
+        outputs_no_mask = model(input_ids=input_ids[:, :5], decoder_input_ids=decoder_input_ids[:, :5])
        attention_mask = torch.ones_like(input_ids)
        decoder_attention_mask = torch.ones_like(decoder_input_ids)

@@ -446,7 +440,6 @@ class ProphetNetModelTester:
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
-            return_dict=True,
        )

        # check encoder
@@ -524,7 +517,6 @@ class ProphetNetStandaloneDecoderModelTester:
        bos_token_id=1,
        eos_token_id=2,
        ngram=2,
-        return_dict=True,
        num_buckets=32,
        relative_max_distance=128,
        disable_ngram_loss=False,
@@ -562,7 +554,6 @@ class ProphetNetStandaloneDecoderModelTester:
        self.max_position_embeddings = max_position_embeddings
        self.add_cross_attention = add_cross_attention
        self.is_encoder_decoder = is_encoder_decoder
-        self.return_dict = return_dict

        self.scope = None
        self.decoder_key_length = decoder_seq_length
@@ -602,7 +593,6 @@ class ProphetNetStandaloneDecoderModelTester:
            max_position_embeddings=self.max_position_embeddings,
            add_cross_attention=self.add_cross_attention,
            is_encoder_decoder=self.is_encoder_decoder,
-            return_dict=self.return_dict,
        )

        return (
@@ -757,7 +747,6 @@ class ProphetNetStandaloneEncoderModelTester:
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
-        return_dict=True,
        num_buckets=32,
        relative_max_distance=128,
        disable_ngram_loss=False,
@@ -794,7 +783,6 @@ class ProphetNetStandaloneEncoderModelTester:
        self.max_position_embeddings = max_position_embeddings
        self.add_cross_attention = add_cross_attention
        self.is_encoder_decoder = is_encoder_decoder
-        self.return_dict = return_dict

        self.scope = None
        self.decoder_key_length = decoder_seq_length
@@ -829,7 +817,6 @@ class ProphetNetStandaloneEncoderModelTester:
            max_position_embeddings=self.max_position_embeddings,
            add_cross_attention=self.add_cross_attention,
            is_encoder_decoder=self.is_encoder_decoder,
-            return_dict=self.return_dict,
        )

        return (
@@ -919,7 +906,6 @@ class ProphetNetModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
    # methods overwrite method in `test_modeling_common.py`
    def test_attention_outputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True

        seq_len = getattr(self.model_tester, "seq_length", None)
        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
@@ -933,7 +919,6 @@ class ProphetNetModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
        for model_class in self.all_model_classes:
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
@@ -1121,7 +1106,6 @@ class ProphetNetModelIntegrationTest(unittest.TestCase):
            attention_mask=None,
            encoder_outputs=None,
            decoder_input_ids=decoder_prev_ids,
-            return_dict=True,
        )
        output_predited_logits = output[0]
        expected_shape = torch.Size((1, 12, 30522))
@@ -1143,9 +1127,7 @@ class ProphetNetModelIntegrationTest(unittest.TestCase):
        assert torch.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4)

        # decoder outputs
-        decoder_outputs = model.prophetnet.decoder(
-            decoder_prev_ids, encoder_hidden_states=encoder_outputs, return_dict=True
-        )
+        decoder_outputs = model.prophetnet.decoder(decoder_prev_ids, encoder_hidden_states=encoder_outputs)
        predicting_streams = decoder_outputs[1].view(1, model.config.ngram, 12, -1)
        predicting_streams_logits = model.lm_head(predicting_streams)
        next_first_stream_logits = predicting_streams_logits[:, 0]
--- a/tests/test_modeling_reformer.py
+++ b/tests/test_modeling_reformer.py
@@ -174,7 +174,6 @@ class ReformerModelTester:
            attn_layers=self.attn_layers,
            pad_token_id=self.pad_token_id,
            hash_seed=self.hash_seed,
-            return_dict=True,
        )

        return (
--- a/tests/test_modeling_roberta.py
+++ b/tests/test_modeling_roberta.py
@@ -103,7 +103,6 @@ class RobertaModelTester:
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
-            return_dict=True,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_squeezebert.py
+++ b/tests/test_modeling_squeezebert.py
@@ -131,7 +131,6 @@ if is_torch_available():
                post_attention_groups=self.post_attention_groups,
                intermediate_groups=self.intermediate_groups,
                output_groups=self.output_groups,
-                return_dict=True,
            )

            return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_t5.py
+++ b/tests/test_modeling_t5.py
@@ -115,7 +115,6 @@ class T5ModelTester:
            bos_token_id=self.pad_token_id,
            pad_token_id=self.pad_token_id,
            decoder_start_token_id=self.decoder_start_token_id,
-            return_dict=True,
        )

        return (
--- a/tests/test_modeling_tf_albert.py
+++ b/tests/test_modeling_tf_albert.py
@@ -121,7 +121,6 @@ class TFAlbertModelTester:
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
-            return_dict=True,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_tf_bart.py
+++ b/tests/test_modeling_tf_bart.py
@@ -182,7 +182,6 @@ class TFBartHeadTests(unittest.TestCase):
            eos_token_id=2,
            pad_token_id=1,
            bos_token_id=0,
-            return_dict=True,
            decoder_start_token_id=2,
        )
        return config, input_ids, batch_size
@@ -206,7 +205,6 @@ class TFBartHeadTests(unittest.TestCase):
            encoder_ffn_dim=32,
            decoder_ffn_dim=32,
            max_position_embeddings=48,
-            return_dict=True,
        )
        lm_model = TFBartForConditionalGeneration(config)
        context = tf.fill((7, 2), 4)
@@ -356,7 +354,7 @@ class FasterTFBartModelIntegrationTests(unittest.TestCase):
            padding="longest",
            truncation=True,
        )
-        features = self.xsum_1_1_model.get_encoder()(**batch, return_dict=True).last_hidden_state
+        features = self.xsum_1_1_model.get_encoder()(**batch).last_hidden_state
        import numpy as np

        expected = np.array([[-0.0828, -0.0251, -0.0674], [0.1277, 0.3311, -0.0255], [0.2613, -0.0840, -0.2763]])
--- a/tests/test_modeling_tf_bert.py
+++ b/tests/test_modeling_tf_bert.py
@@ -120,7 +120,6 @@ class TFBertModelTester:
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
-            return_dict=True,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_tf_camembert.py
+++ b/tests/test_modeling_tf_camembert.py
@@ -39,7 +39,7 @@ class TFCamembertModelIntegrationTest(unittest.TestCase):
            dtype=tf.int32,
        )  # J'aime le camembert !"

-        output = model(input_ids, return_dict=True)["last_hidden_state"]
+        output = model(input_ids)["last_hidden_state"]
        expected_shape = tf.TensorShape((1, 10, 768))
        self.assertEqual(output.shape, expected_shape)
        # compare the actual values for a slice.
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -284,7 +284,7 @@ class TFModelTesterMixin:
        if isinstance(after_outputs, tf.Tensor):
            out_1 = after_outputs.numpy()
        elif isinstance(after_outputs, dict):
-            out_1 = after_outputs[list(after_outputs.keys())[0]]
+            out_1 = after_outputs[list(after_outputs.keys())[0]].numpy()
        else:
            out_1 = after_outputs[0].numpy()
        out_2 = outputs[0].numpy()
--- a/tests/test_modeling_tf_ctrl.py
+++ b/tests/test_modeling_tf_ctrl.py
@@ -94,7 +94,6 @@ class TFCTRLModelTester(object):
            n_ctx=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range,
-            return_dict=True,
        )

        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
--- a/tests/test_modeling_tf_distilbert.py
+++ b/tests/test_modeling_tf_distilbert.py
@@ -91,7 +91,6 @@ class TFDistilBertModelTester:
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            initializer_range=self.initializer_range,
-            return_dict=True,
        )

        return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_tf_electra.py
+++ b/tests/test_modeling_tf_electra.py
@@ -97,7 +97,6 @@ class TFElectraModelTester:
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
-            return_dict=True,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_tf_flaubert.py
+++ b/tests/test_modeling_tf_flaubert.py
@@ -114,7 +114,6 @@ class TFFlaubertModelTester:
            summary_type=self.summary_type,
            use_proj=self.use_proj,
            bos_token_id=self.bos_token_id,
-            return_dict=True,
        )

        return (
--- a/tests/test_modeling_tf_funnel.py
+++ b/tests/test_modeling_tf_funnel.py
@@ -137,7 +137,6 @@ class TFFunnelModelTester:
            activation_dropout=self.activation_dropout,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
-            return_dict=True,
        )

        return (
--- a/tests/test_modeling_tf_gpt2.py
+++ b/tests/test_modeling_tf_gpt2.py
@@ -104,7 +104,6 @@ class TFGPT2ModelTester:
            # initializer_range=self.initializer_range
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
-            return_dict=True,
        )

        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
--- a/tests/test_modeling_tf_longformer.py
+++ b/tests/test_modeling_tf_longformer.py
@@ -594,7 +594,9 @@ class TFLongformerModelIntegrationTest(unittest.TestCase):
        # 'Hello world! ' repeated 1000 times
        input_ids = tf.convert_to_tensor([[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=tf.dtypes.int32)

-        loss, prediction_scores = model(input_ids, labels=input_ids)
+        output = model(input_ids, labels=input_ids)
+        loss = output.loss
+        prediction_scores = output.logits

        expected_loss = tf.constant(0.0073798)
        expected_prediction_scores_sum = tf.constant(-610476600.0)
--- a/tests/test_modeling_tf_lxmert.py
+++ b/tests/test_modeling_tf_lxmert.py
@@ -297,7 +297,6 @@ class TFLxmertModelTester(object):
            matched_label=matched_label,
            ans=ans,
            output_attentions=output_attentions,
-            return_dict=True,
        )
        result = model(
            input_ids,
@@ -352,7 +351,6 @@ class TFLxmertModelTester(object):
            matched_label=matched_label,
            ans=ans,
            output_attentions=not output_attentions,
-            return_dict=True,
        )

        self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
@@ -695,7 +693,8 @@ class TFLxmertModelTest(TFModelTesterMixin, unittest.TestCase):
                model = tf.keras.models.load_model(tmpdirname)
                outputs = model(class_inputs_dict)

-                language_hidden_states, vision_hidden_states = outputs[-2], outputs[-1]
+                language_hidden_states = outputs["language_hidden_states"]
+                vision_hidden_states = outputs["vision_hidden_states"]

                self.assertEqual(len(language_hidden_states), self.model_tester.num_hidden_layers["language"] + 1)
                self.assertEqual(len(vision_hidden_states), self.model_tester.num_hidden_layers["vision"] + 1)
@@ -731,11 +730,9 @@ class TFLxmertModelTest(TFModelTesterMixin, unittest.TestCase):
                model = tf.keras.models.load_model(tmpdirname)
                outputs = model(class_inputs_dict)

-                language_attentions, vision_attentions, cross_encoder_attentions = (
-                    outputs[-3],
-                    outputs[-2],
-                    outputs[-1],
-                )
+                language_attentions = outputs["language_attentions"]
+                vision_attentions = outputs["vision_attentions"]
+                cross_encoder_attentions = outputs["cross_encoder_attentions"]

                self.assertEqual(len(language_attentions), self.model_tester.num_hidden_layers["language"])
                self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers["vision"])
--- a/tests/test_modeling_tf_mobilebert.py
+++ b/tests/test_modeling_tf_mobilebert.py
@@ -139,7 +139,6 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase):
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
                embedding_size=self.embedding_size,
-                return_dict=True,
            )

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_tf_openai.py
+++ b/tests/test_modeling_tf_openai.py
@@ -99,7 +99,6 @@ class TFOpenAIGPTModelTester:
            n_ctx=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range,
-            return_dict=True,
        )

        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
--- a/tests/test_modeling_tf_roberta.py
+++ b/tests/test_modeling_tf_roberta.py
@@ -97,7 +97,6 @@ class TFRobertaModelTester:
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
-            return_dict=True,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_tf_t5.py
+++ b/tests/test_modeling_tf_t5.py
@@ -78,7 +78,6 @@ class TFT5ModelTester:
            bos_token_id=self.pad_token_id,
            pad_token_id=self.pad_token_id,
            decoder_start_token_id=self.pad_token_id,
-            return_dict=True,
        )

        return (config, input_ids, input_mask, token_labels)
--- a/tests/test_modeling_tf_transfo_xl.py
+++ b/tests/test_modeling_tf_transfo_xl.py
@@ -77,7 +77,6 @@ class TFTransfoXLModelTester:
            div_val=self.div_val,
            n_layer=self.num_hidden_layers,
            eos_token_id=self.eos_token_id,
-            return_dict=True,
        )

        return (config, input_ids_1, input_ids_2, lm_labels)
--- a/tests/test_modeling_tf_xlm.py
+++ b/tests/test_modeling_tf_xlm.py
@@ -114,7 +114,6 @@ class TFXLMModelTester:
            summary_type=self.summary_type,
            use_proj=self.use_proj,
            bos_token_id=self.bos_token_id,
-            return_dict=True,
        )

        return (
--- a/tests/test_modeling_tf_xlm_roberta.py
+++ b/tests/test_modeling_tf_xlm_roberta.py
@@ -39,7 +39,7 @@ class TFFlaubertModelIntegrationTest(unittest.TestCase):
            "attention_mask": tf.convert_to_tensor([[1, 1, 1, 1, 1, 1]], dtype=tf.int32),
        }

-        output = model(features, return_dict=True)["last_hidden_state"]
+        output = model(features)["last_hidden_state"]
        expected_shape = tf.TensorShape((1, 6, 768))
        self.assertEqual(output.shape, expected_shape)
        # compare the actual values for a slice.
--- a/tests/test_modeling_tf_xlnet.py
+++ b/tests/test_modeling_tf_xlnet.py
@@ -111,7 +111,6 @@ class TFXLNetModelTester:
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
            eos_token_id=self.eos_token_id,
-            return_dict=True,
        )

        return (
--- a/tests/test_modeling_transfo_xl.py
+++ b/tests/test_modeling_transfo_xl.py
@@ -78,7 +78,6 @@ class TransfoXLModelTester:
            div_val=self.div_val,
            n_layer=self.num_hidden_layers,
            eos_token_id=self.eos_token_id,
-            return_dict=True,
        )

        return (config, input_ids_1, input_ids_2, lm_labels)
--- a/tests/test_modeling_xlm.py
+++ b/tests/test_modeling_xlm.py
@@ -116,7 +116,6 @@ class XLMModelTester:
            use_proj=self.use_proj,
            num_labels=self.num_labels,
            bos_token_id=self.bos_token_id,
-            return_dict=True,
        )

        return (
--- a/tests/test_modeling_xlm_roberta.py
+++ b/tests/test_modeling_xlm_roberta.py
@@ -32,7 +32,7 @@ if is_torch_available():
 class XLMRobertaModelIntegrationTest(unittest.TestCase):
    @slow
    def test_xlm_roberta_base(self):
-        model = XLMRobertaModel.from_pretrained("xlm-roberta-base", return_dict=True)
+        model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])
        # The dog is cute and lives in the garden house

@@ -51,7 +51,7 @@ class XLMRobertaModelIntegrationTest(unittest.TestCase):

    @slow
    def test_xlm_roberta_large(self):
-        model = XLMRobertaModel.from_pretrained("xlm-roberta-large", return_dict=True)
+        model = XLMRobertaModel.from_pretrained("xlm-roberta-large")
        input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])
        # The dog is cute and lives in the garden house

--- a/tests/test_modeling_xlnet.py
+++ b/tests/test_modeling_xlnet.py
@@ -148,7 +148,6 @@ class XLNetModelTester:
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
            eos_token_id=self.eos_token_id,
-            return_dict=True,
        )

        return (