diff --git a/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py b/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py
index 4a1d66b591..d7689d2810 100644
--- a/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py
+++ b/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import XxxConfig, is_tf_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFCommonTestCases, ids_tensor
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_tf, slow
 
 
@@ -32,7 +34,7 @@ if is_tf_available():
 
 
 @require_tf
-class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
+class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
         (
diff --git a/templates/adding_a_new_model/tests/test_modeling_xxx.py b/templates/adding_a_new_model/tests/test_modeling_xxx.py
index b6364447b4..d66a91e3e4 100644
--- a/templates/adding_a_new_model/tests/test_modeling_xxx.py
+++ b/templates/adding_a_new_model/tests/test_modeling_xxx.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import is_torch_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_common import CommonTestCases, ids_tensor
+from .test_modeling_common import ModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
@@ -34,7 +36,7 @@ if is_torch_available():
 
 
 @require_torch
-class XxxModelTest(CommonTestCases.CommonModelTester):
+class XxxModelTest(ModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
         (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering, XxxForSequenceClassification, XxxForTokenClassification)
diff --git a/tests/test_modeling_albert.py b/tests/test_modeling_albert.py
index eeecb0d4d4..67d4b1bbba 100644
--- a/tests/test_modeling_albert.py
+++ b/tests/test_modeling_albert.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import is_torch_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_common import CommonTestCases, ids_tensor
+from .test_modeling_common import ModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
@@ -33,7 +35,7 @@ if is_torch_available():
 
 
 @require_torch
-class AlbertModelTest(CommonTestCases.CommonModelTester):
+class AlbertModelTest(ModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()
 
diff --git a/tests/test_modeling_bert.py b/tests/test_modeling_bert.py
index 52aaece535..3909e3e95e 100644
--- a/tests/test_modeling_bert.py
+++ b/tests/test_modeling_bert.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import is_torch_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_common import CommonTestCases, floats_tensor, ids_tensor
+from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
@@ -37,7 +39,7 @@ if is_torch_available():
 
 
 @require_torch
-class BertModelTest(CommonTestCases.CommonModelTester):
+class BertModelTest(ModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
         (
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 0d1cfbd311..b2ef0c74d9 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -69,737 +69,737 @@ def _config_zero_init(config):
     return configs_no_init
 
 
-class CommonTestCases:
-    @require_torch
-    class CommonModelTester(unittest.TestCase):
+@require_torch
+class ModelTesterMixin:
 
-        model_tester = None
-        all_model_classes = ()
-        test_torchscript = True
-        test_pruning = True
-        test_resize_embeddings = True
-        test_head_masking = True
-        is_encoder_decoder = False
+    model_tester = None
+    all_model_classes = ()
+    test_torchscript = True
+    test_pruning = True
+    test_resize_embeddings = True
+    test_head_masking = True
+    is_encoder_decoder = False
 
-        def test_save_load(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+    def test_save_load(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-            for model_class in self.all_model_classes:
-                model = model_class(config)
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
+            out_2 = outputs[0].numpy()
+            out_2[np.isnan(out_2)] = 0
+
+            with TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model = model_class.from_pretrained(tmpdirname)
                 model.to(torch_device)
-                model.eval()
                 with torch.no_grad():
-                    outputs = model(**inputs_dict)
-                out_2 = outputs[0].numpy()
-                out_2[np.isnan(out_2)] = 0
+                    after_outputs = model(**inputs_dict)
 
-                with TemporaryDirectory() as tmpdirname:
-                    model.save_pretrained(tmpdirname)
-                    model = model_class.from_pretrained(tmpdirname)
-                    model.to(torch_device)
-                    with torch.no_grad():
-                        after_outputs = model(**inputs_dict)
-
-                    # Make sure we don't have nans
-                    out_1 = after_outputs[0].cpu().numpy()
-                    out_1[np.isnan(out_1)] = 0
-                    max_diff = np.amax(np.abs(out_1 - out_2))
-                    self.assertLessEqual(max_diff, 1e-5)
-
-        def test_initialization(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            configs_no_init = _config_zero_init(config)
-            for model_class in self.all_model_classes:
-                model = model_class(config=configs_no_init)
-                for name, param in model.named_parameters():
-                    if param.requires_grad:
-                        self.assertIn(
-                            param.data.mean().item(),
-                            [0.0, 1.0],
-                            msg="Parameter {} of model {} seems not properly initialized".format(name, model_class),
-                        )
-
-        def test_determinism(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            for model_class in self.all_model_classes:
-                model = model_class(config)
-                model.to(torch_device)
-                model.eval()
-                with torch.no_grad():
-                    first = model(**inputs_dict)[0]
-                    second = model(**inputs_dict)[0]
-                out_1 = first.cpu().numpy()
-                out_2 = second.cpu().numpy()
-                out_1 = out_1[~np.isnan(out_1)]
-                out_2 = out_2[~np.isnan(out_2)]
+                # Zero out NaNs so they do not affect the comparison
+                out_1 = after_outputs[0].cpu().numpy()
+                out_1[np.isnan(out_1)] = 0
                 max_diff = np.amax(np.abs(out_1 - out_2))
                 self.assertLessEqual(max_diff, 1e-5)
 
-        def test_attention_outputs(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+    def test_initialization(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-            decoder_seq_length = (
-                self.model_tester.decoder_seq_length
-                if hasattr(self.model_tester, "decoder_seq_length")
-                else self.model_tester.seq_length
-            )
-            encoder_seq_length = (
-                self.model_tester.encoder_seq_length
-                if hasattr(self.model_tester, "encoder_seq_length")
-                else self.model_tester.seq_length
-            )
-            decoder_key_length = (
-                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length
-            )
-            encoder_key_length = (
-                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
-            )
-
-            for model_class in self.all_model_classes:
-                config.output_attentions = True
-                config.output_hidden_states = False
-                model = model_class(config)
-                model.to(torch_device)
-                model.eval()
-                with torch.no_grad():
-                    outputs = model(**inputs_dict)
-                attentions = outputs[-1]
-                self.assertEqual(model.config.output_attentions, True)
-                self.assertEqual(model.config.output_hidden_states, False)
-                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-                )
-                out_len = len(outputs)
-
-                if self.is_encoder_decoder:
-                    self.assertEqual(out_len % 2, 0)
-                    decoder_attentions = outputs[(out_len // 2) - 1]
-                    self.assertEqual(model.config.output_attentions, True)
-                    self.assertEqual(model.config.output_hidden_states, False)
-                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-                    self.assertListEqual(
-                        list(decoder_attentions[0].shape[-3:]),
-                        [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
+        configs_no_init = _config_zero_init(config)
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+            for name, param in model.named_parameters():
+                if param.requires_grad:
+                    self.assertIn(
+                        param.data.mean().item(),
+                        [0.0, 1.0],
+                        msg="Parameter {} of model {} seems not properly initialized".format(name, model_class),
                     )
 
-                # Check attention is always last and order is fine
-                config.output_attentions = True
-                config.output_hidden_states = True
-                model = model_class(config)
-                model.to(torch_device)
-                model.eval()
-                with torch.no_grad():
-                    outputs = model(**inputs_dict)
-                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
-                self.assertEqual(model.config.output_attentions, True)
-                self.assertEqual(model.config.output_hidden_states, True)
+    def test_determinism(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-                self_attentions = outputs[-1]
-                self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                first = model(**inputs_dict)[0]
+                second = model(**inputs_dict)[0]
+            out_1 = first.cpu().numpy()
+            out_2 = second.cpu().numpy()
+            out_1 = out_1[~np.isnan(out_1)]
+            out_2 = out_2[~np.isnan(out_2)]
+            max_diff = np.amax(np.abs(out_1 - out_2))
+            self.assertLessEqual(max_diff, 1e-5)
+
+    def test_attention_outputs(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        decoder_seq_length = (
+            self.model_tester.decoder_seq_length
+            if hasattr(self.model_tester, "decoder_seq_length")
+            else self.model_tester.seq_length
+        )
+        encoder_seq_length = (
+            self.model_tester.encoder_seq_length
+            if hasattr(self.model_tester, "encoder_seq_length")
+            else self.model_tester.seq_length
+        )
+        decoder_key_length = (
+            self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length
+        )
+        encoder_key_length = (
+            self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
+        )
+
+        for model_class in self.all_model_classes:
+            config.output_attentions = True
+            config.output_hidden_states = False
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
+            attentions = outputs[-1]
+            self.assertEqual(model.config.output_attentions, True)
+            self.assertEqual(model.config.output_hidden_states, False)
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+            )
+            out_len = len(outputs)
+
+            if self.is_encoder_decoder:
+                self.assertEqual(out_len % 2, 0)
+                decoder_attentions = outputs[(out_len // 2) - 1]
+                self.assertEqual(model.config.output_attentions, True)
+                self.assertEqual(model.config.output_hidden_states, False)
+                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
-                    list(self_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+                    list(decoder_attentions[0].shape[-3:]),
+                    [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
                 )
 
-        def test_torchscript(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            self._create_and_check_torchscript(config, inputs_dict)
-
-        def test_torchscript_output_attentions(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
+            # Check attention is always last and order is fine
             config.output_attentions = True
-            self._create_and_check_torchscript(config, inputs_dict)
-
-        def test_torchscript_output_hidden_state(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
             config.output_hidden_states = True
-            self._create_and_check_torchscript(config, inputs_dict)
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
+            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
+            self.assertEqual(model.config.output_attentions, True)
+            self.assertEqual(model.config.output_hidden_states, True)
 
-        def _create_and_check_torchscript(self, config, inputs_dict):
-            if not self.test_torchscript:
-                return
+            self_attentions = outputs[-1]
+            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(self_attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+            )
 
-            configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-            configs_no_init.torchscript = True
-            for model_class in self.all_model_classes:
-                model = model_class(config=configs_no_init)
-                model.to(torch_device)
-                model.eval()
-                inputs = inputs_dict["input_ids"]  # Let's keep only input_ids
+    def test_torchscript(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        self._create_and_check_torchscript(config, inputs_dict)
+
+    def test_torchscript_output_attentions(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        config.output_attentions = True
+        self._create_and_check_torchscript(config, inputs_dict)
+
+    def test_torchscript_output_hidden_state(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        config.output_hidden_states = True
+        self._create_and_check_torchscript(config, inputs_dict)
+
+    def _create_and_check_torchscript(self, config, inputs_dict):
+        if not self.test_torchscript:
+            return
+
+        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+        configs_no_init.torchscript = True
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+            model.to(torch_device)
+            model.eval()
+            inputs = inputs_dict["input_ids"]  # Let's keep only input_ids
+
+            try:
+                traced_gpt2 = torch.jit.trace(model, inputs)
+            except RuntimeError:
+                self.fail("Couldn't trace module.")
+
+            with TemporaryDirectory() as tmp_dir_name:
+                pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
 
                 try:
-                    traced_gpt2 = torch.jit.trace(model, inputs)
-                except RuntimeError:
-                    self.fail("Couldn't trace module.")
+                    torch.jit.save(traced_gpt2, pt_file_name)
+                except Exception:
+                    self.fail("Couldn't save module.")
 
-                with TemporaryDirectory() as tmp_dir_name:
-                    pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
+                try:
+                    loaded_model = torch.jit.load(pt_file_name)
+                except Exception:
+                    self.fail("Couldn't load module.")
 
-                    try:
-                        torch.jit.save(traced_gpt2, pt_file_name)
-                    except Exception:
-                        self.fail("Couldn't save module.")
+            model.to(torch_device)
+            model.eval()
 
-                    try:
-                        loaded_model = torch.jit.load(pt_file_name)
-                    except Exception:
-                        self.fail("Couldn't load module.")
+            loaded_model.to(torch_device)
+            loaded_model.eval()
 
-                model.to(torch_device)
-                model.eval()
+            model_params = model.parameters()
+            loaded_model_params = loaded_model.parameters()
 
-                loaded_model.to(torch_device)
-                loaded_model.eval()
+            models_equal = True
+            for p1, p2 in zip(model_params, loaded_model_params):
+                if p1.data.ne(p2.data).sum() > 0:
+                    models_equal = False
 
-                model_params = model.parameters()
-                loaded_model_params = loaded_model.parameters()
+            self.assertTrue(models_equal)
 
-                models_equal = True
-                for p1, p2 in zip(model_params, loaded_model_params):
-                    if p1.data.ne(p2.data).sum() > 0:
-                        models_equal = False
+    def test_headmasking(self):
+        if not self.test_head_masking:
+            return
 
-                self.assertTrue(models_equal)
+        global_rng.seed(42)
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        global_rng.seed()
 
-        def test_headmasking(self):
-            if not self.test_head_masking:
-                return
+        config.output_attentions = True
+        config.output_hidden_states = True
+        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+            model.to(torch_device)
+            model.eval()
 
-            global_rng.seed(42)
+            # Prepare head_mask
+            # Set requires_grad after having prepared the tensor to avoid an error (leaf variable has been moved into the graph interior)
+            head_mask = torch.ones(
+                self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device
+            )
+            head_mask[0, 0] = 0
+            head_mask[-1, :-1] = 0
+            head_mask.requires_grad_(requires_grad=True)
+            inputs = inputs_dict.copy()
+            inputs["head_mask"] = head_mask
+
+            outputs = model(**inputs)
+
+            # Test that we can get a gradient back for importance score computation
+            output = sum(t.sum() for t in outputs[0])
+            output = output.sum()
+            output.backward()
+            multihead_outputs = head_mask.grad
+
+            attentions = outputs[-1]
+            hidden_states = outputs[-2]
+
+            # Remove Nan
+            for t in attentions:
+                self.assertLess(
+                    torch.sum(torch.isnan(t)), t.numel() / 4
+                )  # Check we don't have more than 25% nans (arbitrary)
+            attentions = [
+                t.masked_fill(torch.isnan(t), 0.0) for t in attentions
+            ]  # remove them (the test is less complete)
+
+            self.assertIsNotNone(multihead_outputs)
+            self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
+            self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
+            self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
+            self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
+            self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
+            self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
+
+    def test_head_pruning(self):
+        if not self.test_pruning:
+            return
+
+        for model_class in self.all_model_classes:
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            global_rng.seed()
+
+            if "head_mask" in inputs_dict:
+                del inputs_dict["head_mask"]
 
             config.output_attentions = True
+            config.output_hidden_states = False
+            model = model_class(config=config)
+            model.to(torch_device)
+            model.eval()
+            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
+            model.prune_heads(heads_to_prune)
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
+
+            attentions = outputs[-1]
+
+            self.assertEqual(attentions[0].shape[-3], 1)
+            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
+
+    def test_head_pruning_save_load_from_pretrained(self):
+        if not self.test_pruning:
+            return
+
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+            if "head_mask" in inputs_dict:
+                del inputs_dict["head_mask"]
+
+            config.output_attentions = True
+            config.output_hidden_states = False
+            model = model_class(config=config)
+            model.to(torch_device)
+            model.eval()
+            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
+            model.prune_heads(heads_to_prune)
+
+            with TemporaryDirectory() as temp_dir_name:
+                model.save_pretrained(temp_dir_name)
+                model = model_class.from_pretrained(temp_dir_name)
+                model.to(torch_device)
+
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
+            attentions = outputs[-1]
+            self.assertEqual(attentions[0].shape[-3], 1)
+            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
+
+    def test_head_pruning_save_load_from_config_init(self):
+        if not self.test_pruning:
+            return
+
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+            if "head_mask" in inputs_dict:
+                del inputs_dict["head_mask"]
+
+            config.output_attentions = True
+            config.output_hidden_states = False
+
+            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
+            config.pruned_heads = heads_to_prune
+
+            model = model_class(config=config)
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
+            attentions = outputs[-1]
+
+            self.assertEqual(attentions[0].shape[-3], 1)
+            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
+
+    def test_head_pruning_integration(self):
+        if not self.test_pruning:
+            return
+
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+            if "head_mask" in inputs_dict:
+                del inputs_dict["head_mask"]
+
+            config.output_attentions = True
+            config.output_hidden_states = False
+
+            heads_to_prune = {0: [0], 1: [1, 2]}
+            config.pruned_heads = heads_to_prune
+
+            model = model_class(config=config)
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
+            attentions = outputs[-1]
+
+            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
+            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
+            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
+            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
+
+            with TemporaryDirectory() as temp_dir_name:
+                model.save_pretrained(temp_dir_name)
+                model = model_class.from_pretrained(temp_dir_name)
+                model.to(torch_device)
+
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
+            attentions = outputs[-1]
+
+            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
+            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
+            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
+            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
+
+            heads_to_prune = {0: [0], 2: [1, 2]}
+            model.prune_heads(heads_to_prune)
+
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
+            attentions = outputs[-1]
+
+            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
+            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
+            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
+            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
+
+            self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
+
+    def test_hidden_states_output(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
             config.output_hidden_states = True
-            configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-            for model_class in self.all_model_classes:
-                model = model_class(config=configs_no_init)
-                model.to(torch_device)
-                model.eval()
+            config.output_attentions = False
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
+            hidden_states = outputs[-1]
+            self.assertEqual(model.config.output_attentions, False)
+            self.assertEqual(model.config.output_hidden_states, True)
+            self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
+            self.assertListEqual(
+                list(hidden_states[0].shape[-2:]),
+                [
+                    self.model_tester.encoder_seq_length
+                    if hasattr(self.model_tester, "encoder_seq_length")
+                    else self.model_tester.seq_length,
+                    self.model_tester.hidden_size,
+                ],
+            )
 
-                # Prepare head_mask
-                # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
-                head_mask = torch.ones(
-                    self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device
-                )
-                head_mask[0, 0] = 0
-                head_mask[-1, :-1] = 0
-                head_mask.requires_grad_(requires_grad=True)
-                inputs = inputs_dict.copy()
-                inputs["head_mask"] = head_mask
+    def test_resize_tokens_embeddings(self):
+        original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        if not self.test_resize_embeddings:
+            return
 
-                outputs = model(**inputs)
+        for model_class in self.all_model_classes:
+            config = copy.deepcopy(original_config)
+            model = model_class(config)
 
-                # Test that we can get a gradient back for importance score computation
-                output = sum(t.sum() for t in outputs[0])
-                output = output.sum()
-                output.backward()
-                multihead_outputs = head_mask.grad
+            model_vocab_size = config.vocab_size
+            # Retrieve the embeddings and clone them
+            model_embed = model.resize_token_embeddings(model_vocab_size)
+            cloned_embeddings = model_embed.weight.clone()
 
-                attentions = outputs[-1]
-                hidden_states = outputs[-2]
+            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+            model_embed = model.resize_token_embeddings(model_vocab_size + 10)
+            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
 
-                # Remove Nan
-                for t in attentions:
-                    self.assertLess(
-                        torch.sum(torch.isnan(t)), t.numel() / 4
-                    )  # Check we don't have more than 25% nans (arbitrary)
-                attentions = [
-                    t.masked_fill(torch.isnan(t), 0.0) for t in attentions
-                ]  # remove them (the test is less complete)
+            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+            model_embed = model.resize_token_embeddings(model_vocab_size - 15)
+            self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
 
-                self.assertIsNotNone(multihead_outputs)
-                self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
-                self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
-                self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
-                self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
-                self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
-                self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
+            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
+            models_equal = True
+            for p1, p2 in zip(cloned_embeddings, model_embed.weight):
+                if p1.data.ne(p2.data).sum() > 0:
+                    models_equal = False
 
-        def test_head_pruning(self):
-            if not self.test_pruning:
-                return
+            self.assertTrue(models_equal)
 
-            for model_class in self.all_model_classes:
-                config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+    def test_model_common_attributes(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-                if "head_mask" in inputs_dict:
-                    del inputs_dict["head_mask"]
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding))
+            model.set_input_embeddings(torch.nn.Embedding(10, 10))
+            x = model.get_output_embeddings()
+            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
 
-                config.output_attentions = True
-                config.output_hidden_states = False
-                model = model_class(config=config)
-                model.to(torch_device)
-                model.eval()
-                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
-                model.prune_heads(heads_to_prune)
-                with torch.no_grad():
-                    outputs = model(**inputs_dict)
+    def test_tie_model_weights(self):
+        if not self.test_torchscript:
+            return
 
-                attentions = outputs[-1]
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-                self.assertEqual(attentions[0].shape[-3], 1)
-                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
-                self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
+        def check_same_values(layer_1, layer_2):
+            equal = True
+            for p1, p2 in zip(layer_1.weight, layer_2.weight):
+                if p1.data.ne(p2.data).sum() > 0:
+                    equal = False
+            return equal
 
-        def test_head_pruning_save_load_from_pretrained(self):
-            if not self.test_pruning:
-                return
+        for model_class in self.all_model_classes:
+            config.torchscript = True
+            model_not_tied = model_class(config)
+            if model_not_tied.get_output_embeddings() is None:
+                continue
 
-            for model_class in self.all_model_classes:
-                config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            params_not_tied = list(model_not_tied.parameters())
 
-                if "head_mask" in inputs_dict:
-                    del inputs_dict["head_mask"]
+            config_tied = copy.deepcopy(config)
+            config_tied.torchscript = False
+            model_tied = model_class(config_tied)
+            params_tied = list(model_tied.parameters())
 
-                config.output_attentions = True
-                config.output_hidden_states = False
-                model = model_class(config=config)
-                model.to(torch_device)
-                model.eval()
-                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
-                model.prune_heads(heads_to_prune)
+            # Check that the embedding layer and decoding layer are the same in size and in value
+            self.assertGreater(len(params_not_tied), len(params_tied))
+            # self.assertTrue(check_same_values(embeddings, decoding))
 
-                with TemporaryDirectory() as temp_dir_name:
-                    model.save_pretrained(temp_dir_name)
-                    model = model_class.from_pretrained(temp_dir_name)
-                    model.to(torch_device)
+            # # Check that after modification, they remain the same.
+            # embeddings.weight.data.div_(2)
+            # # Check that the embedding layer and decoding layer are the same in size and in value
+            # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
+            # self.assertTrue(check_same_values(embeddings, decoding))
 
-                with torch.no_grad():
-                    outputs = model(**inputs_dict)
-                attentions = outputs[-1]
-                self.assertEqual(attentions[0].shape[-3], 1)
-                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
-                self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
+            # # Check that after modification, they remain the same.
+            # decoding.weight.data.div_(4)
+            # # Check that the embedding layer and decoding layer are the same in size and in value
+            # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
+            # self.assertTrue(check_same_values(embeddings, decoding))
 
-        def test_head_pruning_save_load_from_config_init(self):
-            if not self.test_pruning:
-                return
+            # Check that after resize they remain tied.
+            model_tied.resize_token_embeddings(config.vocab_size + 10)
+            params_tied_2 = list(model_tied.parameters())
+            self.assertGreater(len(params_not_tied), len(params_tied))
+            self.assertEqual(len(params_tied_2), len(params_tied))
 
-            for model_class in self.all_model_classes:
-                config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            # decoding.weight.data.mul_(20)
+            # # Check that the embedding layer and decoding layer are the same in size and in value
+            # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
+            # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
 
-                if "head_mask" in inputs_dict:
-                    del inputs_dict["head_mask"]
+    def test_inputs_embeds(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        if not self.is_encoder_decoder:
+            input_ids = inputs_dict["input_ids"]
+            del inputs_dict["input_ids"]
+        else:
+            encoder_input_ids = inputs_dict["encoder_input_ids"]
+            decoder_input_ids = inputs_dict["decoder_input_ids"]
+            del inputs_dict["encoder_input_ids"]
+            del inputs_dict["decoder_input_ids"]
 
-                config.output_attentions = True
-                config.output_hidden_states = False
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
 
-                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
-                config.pruned_heads = heads_to_prune
-
-                model = model_class(config=config)
-                model.to(torch_device)
-                model.eval()
-
-                with torch.no_grad():
-                    outputs = model(**inputs_dict)
-                attentions = outputs[-1]
-
-                self.assertEqual(attentions[0].shape[-3], 1)
-                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
-                self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
-
-        def test_head_pruning_integration(self):
-            if not self.test_pruning:
-                return
-
-            for model_class in self.all_model_classes:
-                config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-                if "head_mask" in inputs_dict:
-                    del inputs_dict["head_mask"]
-
-                config.output_attentions = True
-                config.output_hidden_states = False
-
-                heads_to_prune = {0: [0], 1: [1, 2]}
-                config.pruned_heads = heads_to_prune
-
-                model = model_class(config=config)
-                model.to(torch_device)
-                model.eval()
-
-                with torch.no_grad():
-                    outputs = model(**inputs_dict)
-                attentions = outputs[-1]
-
-                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
-                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-                self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
-                self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
-
-                with TemporaryDirectory() as temp_dir_name:
-                    model.save_pretrained(temp_dir_name)
-                    model = model_class.from_pretrained(temp_dir_name)
-                    model.to(torch_device)
-
-                with torch.no_grad():
-                    outputs = model(**inputs_dict)
-                attentions = outputs[-1]
-
-                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
-                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-                self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
-                self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
-
-                heads_to_prune = {0: [0], 2: [1, 2]}
-                model.prune_heads(heads_to_prune)
-
-                with torch.no_grad():
-                    outputs = model(**inputs_dict)
-                attentions = outputs[-1]
-
-                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
-                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-                self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
-                self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
-
-                self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
-
-        def test_hidden_states_output(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            for model_class in self.all_model_classes:
-                config.output_hidden_states = True
-                config.output_attentions = False
-                model = model_class(config)
-                model.to(torch_device)
-                model.eval()
-                with torch.no_grad():
-                    outputs = model(**inputs_dict)
-                hidden_states = outputs[-1]
-                self.assertEqual(model.config.output_attentions, False)
-                self.assertEqual(model.config.output_hidden_states, True)
-                self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
-                self.assertListEqual(
-                    list(hidden_states[0].shape[-2:]),
-                    [
-                        self.model_tester.encoder_seq_length
-                        if hasattr(self.model_tester, "encoder_seq_length")
-                        else self.model_tester.seq_length,
-                        self.model_tester.hidden_size,
-                    ],
-                )
-
-        def test_resize_tokens_embeddings(self):
-            original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            if not self.test_resize_embeddings:
-                return
-
-            for model_class in self.all_model_classes:
-                config = copy.deepcopy(original_config)
-                model = model_class(config)
-
-                model_vocab_size = config.vocab_size
-                # Retrieve the embeddings and clone theme
-                model_embed = model.resize_token_embeddings(model_vocab_size)
-                cloned_embeddings = model_embed.weight.clone()
-
-                # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
-                model_embed = model.resize_token_embeddings(model_vocab_size + 10)
-                self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
-                # Check that it actually resizes the embeddings matrix
-                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
-
-                # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
-                model_embed = model.resize_token_embeddings(model_vocab_size - 15)
-                self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
-                # Check that it actually resizes the embeddings matrix
-                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
-
-                # Check that adding and removing tokens has not modified the first part of the embedding matrix.
-                models_equal = True
-                for p1, p2 in zip(cloned_embeddings, model_embed.weight):
-                    if p1.data.ne(p2.data).sum() > 0:
-                        models_equal = False
-
-                self.assertTrue(models_equal)
-
-        def test_model_common_attributes(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            for model_class in self.all_model_classes:
-                model = model_class(config)
-                self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding))
-                model.set_input_embeddings(torch.nn.Embedding(10, 10))
-                x = model.get_output_embeddings()
-                self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
-
-        def test_tie_model_weights(self):
-            if not self.test_torchscript:
-                return
-
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            def check_same_values(layer_1, layer_2):
-                equal = True
-                for p1, p2 in zip(layer_1.weight, layer_2.weight):
-                    if p1.data.ne(p2.data).sum() > 0:
-                        equal = False
-                return equal
-
-            for model_class in self.all_model_classes:
-                config.torchscript = True
-                model_not_tied = model_class(config)
-                if model_not_tied.get_output_embeddings() is None:
-                    continue
-
-                params_not_tied = list(model_not_tied.parameters())
-
-                config_tied = copy.deepcopy(config)
-                config_tied.torchscript = False
-                model_tied = model_class(config_tied)
-                params_tied = list(model_tied.parameters())
-
-                # Check that the embedding layer and decoding layer are the same in size and in value
-                self.assertGreater(len(params_not_tied), len(params_tied))
-                # self.assertTrue(check_same_values(embeddings, decoding))
-
-                # # Check that after modification, they remain the same.
-                # embeddings.weight.data.div_(2)
-                # # Check that the embedding layer and decoding layer are the same in size and in value
-                # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
-                # self.assertTrue(check_same_values(embeddings, decoding))
-
-                # # Check that after modification, they remain the same.
-                # decoding.weight.data.div_(4)
-                # # Check that the embedding layer and decoding layer are the same in size and in value
-                # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
-                # self.assertTrue(check_same_values(embeddings, decoding))
-
-                # Check that after resize they remain tied.
-                model_tied.resize_token_embeddings(config.vocab_size + 10)
-                params_tied_2 = list(model_tied.parameters())
-                self.assertGreater(len(params_not_tied), len(params_tied))
-                self.assertEqual(len(params_tied_2), len(params_tied))
-
-                # decoding.weight.data.mul_(20)
-                # # Check that the embedding layer and decoding layer are the same in size and in value
-                # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
-                # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
-
-        def test_inputs_embeds(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            wte = model.get_input_embeddings()
             if not self.is_encoder_decoder:
-                input_ids = inputs_dict["input_ids"]
-                del inputs_dict["input_ids"]
+                inputs_dict["inputs_embeds"] = wte(input_ids)
             else:
-                encoder_input_ids = inputs_dict["encoder_input_ids"]
-                decoder_input_ids = inputs_dict["decoder_input_ids"]
-                del inputs_dict["encoder_input_ids"]
-                del inputs_dict["decoder_input_ids"]
-
-            for model_class in self.all_model_classes:
-                model = model_class(config)
-                model.to(torch_device)
-                model.eval()
-
-                wte = model.get_input_embeddings()
-                if not self.is_encoder_decoder:
-                    inputs_dict["inputs_embeds"] = wte(input_ids)
-                else:
-                    inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
-                    inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)
-
-                with torch.no_grad():
-                    outputs = model(**inputs_dict)
-
-    class GPTModelTester(CommonModelTester):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_position_ids=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            n_positions=33,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            n_choices=3,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            scope=None,
-            config_class=None,
-            base_model_class=None,
-            lm_head_model_class=None,
-            double_head_model_class=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_position_ids = use_position_ids
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.n_positions = n_positions
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.n_choices = n_choices
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.scope = scope
-            self.config_class = config_class
-            self.base_model_class = base_model_class
-            self.lm_head_model_class = lm_head_model_class
-            self.double_head_model_class = double_head_model_class
-            self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
-
-        def prepare_config_and_inputs(self):
-            total_num_tokens = self.vocab_size
-            input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
-
-            position_ids = None
-            if self.use_position_ids:
-                position_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                total_voc = self.vocab_size
-                token_type_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
-
-            mc_labels = None
-            lm_labels = None
-            mc_token_ids = None
-            if self.use_labels:
-                mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                lm_labels = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
-                mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
-
-            config = self.config_class(
-                vocab_size=self.vocab_size,
-                n_positions=self.n_positions,
-                n_embd=self.hidden_size,
-                n_layer=self.num_hidden_layers,
-                n_head=self.num_attention_heads,
-                initializer_range=self.initializer_range,
-            )
-
-            return (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids)
-
-        def create_and_check_base_model(
-            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
-        ):
-            model = self.base_model_class(config)
-            model.to(torch_device)
-            model.eval()
+                inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
+                inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)
 
             with torch.no_grad():
-                outputs = model(input_ids, position_ids, token_type_ids)
-                outputs = model(input_ids, position_ids)
+                outputs = model(**inputs_dict)
+
+
+class GPTModelTester(ModelTesterMixin):
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_position_ids=True,
+        use_token_type_ids=True,
+        use_labels=True,
+        vocab_size=99,
+        n_positions=33,
+        hidden_size=32,
+        num_hidden_layers=5,
+        num_attention_heads=4,
+        n_choices=3,
+        type_sequence_label_size=2,
+        initializer_range=0.02,
+        num_labels=3,
+        scope=None,
+        config_class=None,
+        base_model_class=None,
+        lm_head_model_class=None,
+        double_head_model_class=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_position_ids = use_position_ids
+        self.use_token_type_ids = use_token_type_ids
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.n_choices = n_choices
+        self.type_sequence_label_size = type_sequence_label_size
+        self.initializer_range = initializer_range
+        self.num_labels = num_labels
+        self.scope = scope
+        self.config_class = config_class
+        self.base_model_class = base_model_class
+        self.lm_head_model_class = lm_head_model_class
+        self.double_head_model_class = double_head_model_class
+        self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
+
+    def prepare_config_and_inputs(self):
+        total_num_tokens = self.vocab_size
+        input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
+
+        position_ids = None
+        if self.use_position_ids:
+            position_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            total_voc = self.vocab_size
+            token_type_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
+
+        mc_labels = None
+        lm_labels = None
+        mc_token_ids = None
+        if self.use_labels:
+            mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            lm_labels = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
+            mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
+
+        config = self.config_class(
+            vocab_size=self.vocab_size,
+            n_positions=self.n_positions,
+            n_embd=self.hidden_size,
+            n_layer=self.num_hidden_layers,
+            n_head=self.num_attention_heads,
+            initializer_range=self.initializer_range,
+        )
+
+        return (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids)
+
+    def create_and_check_base_model(
+        self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
+    ):
+        model = self.base_model_class(config)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            outputs = model(input_ids, position_ids, token_type_ids)
+            outputs = model(input_ids, position_ids)
+            outputs = model(input_ids)
+
+        hidden_state = outputs[0]
+        self.parent.assertListEqual(
+            list(hidden_state.size()), [self.batch_size, self.n_choices, self.seq_length, self.hidden_size]
+        )
+
+    def create_and_check_lm_head(
+        self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
+    ):
+        model = self.lm_head_model_class(config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
+        loss, lm_logits = outputs[:2]
+
+        total_voc = self.vocab_size
+        self.parent.assertListEqual(
+            list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]
+        )
+        self.parent.assertListEqual(list(loss.size()), [])
+
+    def create_and_check_presents(
+        self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
+    ):
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
                 outputs = model(input_ids)
-
-            hidden_state = outputs[0]
+            presents = outputs[-1]
+            self.parent.assertEqual(self.num_hidden_layers, len(presents))
             self.parent.assertListEqual(
-                list(hidden_state.size()), [self.batch_size, self.n_choices, self.seq_length, self.hidden_size]
+                list(presents[0].size()),
+                [
+                    2,
+                    self.batch_size * self.n_choices,
+                    self.num_attention_heads,
+                    self.seq_length,
+                    self.hidden_size // self.num_attention_heads,
+                ],
             )
 
-        def create_and_check_lm_head(
-            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
-        ):
-            model = self.lm_head_model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
-            loss, lm_logits = outputs[:2]
-
-            total_voc = self.vocab_size
-            self.parent.assertListEqual(
-                list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]
+    def create_and_check_double_heads(
+        self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
+    ):
+        model = self.double_head_model_class(config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            outputs = model(
+                input_ids,
+                mc_token_ids,
+                lm_labels=lm_labels,
+                mc_labels=mc_labels,
+                token_type_ids=token_type_ids,
+                position_ids=position_ids,
             )
-            self.parent.assertListEqual(list(loss.size()), [])
+        lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
+        loss = [lm_loss, mc_loss]
 
-        def create_and_check_presents(
-            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
-        ):
-            for model_class in self.all_model_classes:
-                model = model_class(config)
-                model.to(torch_device)
-                model.eval()
-                with torch.no_grad():
-                    outputs = model(input_ids)
-                presents = outputs[-1]
-                self.parent.assertEqual(self.num_hidden_layers, len(presents))
-                self.parent.assertListEqual(
-                    list(presents[0].size()),
-                    [
-                        2,
-                        self.batch_size * self.n_choices,
-                        self.num_attention_heads,
-                        self.seq_length,
-                        self.hidden_size // self.num_attention_heads,
-                    ],
-                )
+        total_voc = self.vocab_size
+        self.parent.assertListEqual(
+            list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]
+        )
+        self.parent.assertListEqual(list(mc_logits.size()), [self.batch_size, self.n_choices])
+        self.parent.assertListEqual([list(l.size()) for l in loss], [[], []])
 
-        def create_and_check_double_heads(
-            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
-        ):
-            model = self.double_head_model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(
-                    input_ids,
-                    mc_token_ids,
-                    lm_labels=lm_labels,
-                    mc_labels=mc_labels,
-                    token_type_ids=token_type_ids,
-                    position_ids=position_ids,
-                )
-            lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
-            loss = [lm_loss, mc_loss]
+    def create_and_check_model_from_pretrained(self):
+        for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
+            model = self.base_model_class.from_pretrained(model_name, cache_dir=CACHE_DIR)
+            self.parent.assertIsNotNone(model)
 
-            total_voc = self.vocab_size
-            self.parent.assertListEqual(
-                list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]
-            )
-            self.parent.assertListEqual(list(mc_logits.size()), [self.batch_size, self.n_choices])
-            self.parent.assertListEqual([list(l.size()) for l in loss], [[], []])
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids}
+        return config, inputs_dict
 
-        def create_and_check_model_from_pretrained(self):
-            for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
-                model = self.base_model_class.from_pretrained(model_name, cache_dir=CACHE_DIR)
-                self.parent.assertIsNotNone(model)
+    def run_common_tests(self, test_presents=False):
+        config_and_inputs = self.prepare_config_and_inputs()
+        self.create_and_check_base_model(*config_and_inputs)
 
-        def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        self.create_and_check_lm_head(*config_and_inputs)
+
+        config_and_inputs = self.prepare_config_and_inputs()
+        self.create_and_check_double_heads(*config_and_inputs)
+
+        if test_presents:
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids}
-            return config, inputs_dict
+            self.create_and_check_presents(*config_and_inputs)
 
-        def run_common_tests(self, test_presents=False):
-            config_and_inputs = self.prepare_config_and_inputs()
-            self.create_and_check_base_model(*config_and_inputs)
-
-            config_and_inputs = self.prepare_config_and_inputs()
-            self.create_and_check_lm_head(*config_and_inputs)
-
-            config_and_inputs = self.prepare_config_and_inputs()
-            self.create_and_check_double_heads(*config_and_inputs)
-
-            if test_presents:
-                config_and_inputs = self.prepare_config_and_inputs()
-                self.create_and_check_presents(*config_and_inputs)
-
-        @slow
-        def run_slow_tests(self):
-            self.create_and_check_model_from_pretrained()
+    @slow
+    def run_slow_tests(self):
+        self.create_and_check_model_from_pretrained()
 
 
 class ConfigTester(object):
diff --git a/tests/test_modeling_ctrl.py b/tests/test_modeling_ctrl.py
index 519b529299..b2fa4941ab 100644
--- a/tests/test_modeling_ctrl.py
+++ b/tests/test_modeling_ctrl.py
@@ -13,10 +13,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import is_torch_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_common import CommonTestCases, ids_tensor
+from .test_modeling_common import ModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
@@ -25,7 +27,7 @@ if is_torch_available():
 
 
 @require_torch
-class CTRLModelTest(CommonTestCases.CommonModelTester):
+class CTRLModelTest(ModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else ()
     test_pruning = False
diff --git a/tests/test_modeling_distilbert.py b/tests/test_modeling_distilbert.py
index 6c83751bad..b63c50a8e0 100644
--- a/tests/test_modeling_distilbert.py
+++ b/tests/test_modeling_distilbert.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import is_torch_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_common import CommonTestCases, ids_tensor
+from .test_modeling_common import ModelTesterMixin, ids_tensor
 from .utils import require_torch, torch_device
 
 
@@ -33,7 +35,7 @@ if is_torch_available():
 
 
 @require_torch
-class DistilBertModelTest(CommonTestCases.CommonModelTester):
+class DistilBertModelTest(ModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
         (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py
index 2ca8f14d7a..55422d0ab7 100644
--- a/tests/test_modeling_gpt2.py
+++ b/tests/test_modeling_gpt2.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import is_torch_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_common import CommonTestCases, ids_tensor
+from .test_modeling_common import ModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
@@ -32,7 +34,7 @@ if is_torch_available():
 
 
 @require_torch
-class GPT2ModelTest(CommonTestCases.CommonModelTester):
+class GPT2ModelTest(ModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
 
diff --git a/tests/test_modeling_openai.py b/tests/test_modeling_openai.py
index d9b2d0f228..ccc187357f 100644
--- a/tests/test_modeling_openai.py
+++ b/tests/test_modeling_openai.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import is_torch_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_common import CommonTestCases, ids_tensor
+from .test_modeling_common import ModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
@@ -32,7 +34,7 @@ if is_torch_available():
 
 
 @require_torch
-class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
+class OpenAIGPTModelTest(ModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
         (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
diff --git a/tests/test_modeling_roberta.py b/tests/test_modeling_roberta.py
index 61d2055aea..1994405e07 100644
--- a/tests/test_modeling_roberta.py
+++ b/tests/test_modeling_roberta.py
@@ -19,7 +19,7 @@ import unittest
 from transformers import is_torch_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_common import CommonTestCases, ids_tensor
+from .test_modeling_common import ModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
@@ -37,7 +37,7 @@ if is_torch_available():
 
 
 @require_torch
-class RobertaModelTest(CommonTestCases.CommonModelTester):
+class RobertaModelTest(ModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()
 
diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py
index f12fa10d3b..4c58a67972 100644
--- a/tests/test_modeling_t5.py
+++ b/tests/test_modeling_t5.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import is_torch_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_common import CommonTestCases, ids_tensor
+from .test_modeling_common import ModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_torch, slow
 
 
@@ -27,7 +29,7 @@ if is_torch_available():
 
 
 @require_torch
-class T5ModelTest(CommonTestCases.CommonModelTester):
+class T5ModelTest(ModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (T5Model, T5WithLMHeadModel) if is_torch_available() else ()
     test_pruning = False
diff --git a/tests/test_modeling_tf_albert.py b/tests/test_modeling_tf_albert.py
index 3761c3252f..130d0de8b1 100644
--- a/tests/test_modeling_tf_albert.py
+++ b/tests/test_modeling_tf_albert.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import AlbertConfig, is_tf_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFCommonTestCases, ids_tensor
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_tf, slow
 
 
@@ -31,7 +33,7 @@ if is_tf_available():
 
 
 @require_tf
-class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
+class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
         (TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification) if is_tf_available() else ()
diff --git a/tests/test_modeling_tf_bert.py b/tests/test_modeling_tf_bert.py
index d93fd133ee..140a4f4db2 100644
--- a/tests/test_modeling_tf_bert.py
+++ b/tests/test_modeling_tf_bert.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import BertConfig, is_tf_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFCommonTestCases, ids_tensor
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_tf, slow
 
 
@@ -36,7 +38,7 @@ if is_tf_available():
 
 
 @require_tf
-class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
+class TFBertModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
         (
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 868eb2d9e0..83f089eda9 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -20,7 +20,6 @@ import random
 import shutil
 import sys
 import tempfile
-import unittest
 
 from transformers import is_tf_available, is_torch_available
 
@@ -59,307 +58,300 @@ def _config_zero_init(config):
     return configs_no_init
 
 
-class TFCommonTestCases:
-    @require_tf
-    class TFCommonModelTester(unittest.TestCase):
+@require_tf
+class TFModelTesterMixin:
 
-        model_tester = None
-        all_model_classes = ()
-        test_torchscript = True
-        test_pruning = True
-        test_resize_embeddings = True
-        is_encoder_decoder = False
+    model_tester = None
+    all_model_classes = ()
+    test_torchscript = True
+    test_pruning = True
+    test_resize_embeddings = True
+    is_encoder_decoder = False
 
-        def test_initialization(self):
-            pass
-            # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+    def test_initialization(self):
+        pass
+        # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-            # configs_no_init = _config_zero_init(config)
-            # for model_class in self.all_model_classes:
-            #     model = model_class(config=configs_no_init)
-            #     for name, param in model.named_parameters():
-            #         if param.requires_grad:
-            #             self.assertIn(param.data.mean().item(), [0.0, 1.0],
-            #             msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
+        # configs_no_init = _config_zero_init(config)
+        # for model_class in self.all_model_classes:
+        #     model = model_class(config=configs_no_init)
+        #     for name, param in model.named_parameters():
+        #         if param.requires_grad:
+        #             self.assertIn(param.data.mean().item(), [0.0, 1.0],
+        #             msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
 
-        def test_save_load(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+    def test_save_load(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-            for model_class in self.all_model_classes:
-                model = model_class(config)
-                outputs = model(inputs_dict)
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            outputs = model(inputs_dict)
 
-                with TemporaryDirectory() as tmpdirname:
-                    model.save_pretrained(tmpdirname)
-                    model = model_class.from_pretrained(tmpdirname)
-                    after_outputs = model(inputs_dict)
+            with TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model = model_class.from_pretrained(tmpdirname)
+                after_outputs = model(inputs_dict)
 
-                    # Make sure we don't have nans
-                    out_1 = after_outputs[0].numpy()
-                    out_2 = outputs[0].numpy()
-                    out_1 = out_1[~np.isnan(out_1)]
-                    out_2 = out_2[~np.isnan(out_2)]
-                    max_diff = np.amax(np.abs(out_1 - out_2))
-                    self.assertLessEqual(max_diff, 1e-5)
-
-        def test_pt_tf_model_equivalence(self):
-            if not is_torch_available():
-                return
-
-            import torch
-            import transformers
-
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            for model_class in self.all_model_classes:
-                pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beggining
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                config.output_hidden_states = True
-                tf_model = model_class(config)
-                pt_model = pt_model_class(config)
-
-                # Check we can load pt model in tf and vice-versa with model => model functions
-                tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict)
-                pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
-
-                # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
-                pt_model.eval()
-                pt_inputs_dict = dict(
-                    (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items()
-                )
-                with torch.no_grad():
-                    pto = pt_model(**pt_inputs_dict)
-                tfo = tf_model(inputs_dict, training=False)
-                tf_hidden_states = tfo[0].numpy()
-                pt_hidden_states = pto[0].numpy()
-                tf_hidden_states[np.isnan(tf_hidden_states)] = 0
-                pt_hidden_states[np.isnan(pt_hidden_states)] = 0
-                max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
-                self.assertLessEqual(max_diff, 2e-2)
-
-                # Check we can load pt model in tf and vice-versa with checkpoint => model functions
-                with TemporaryDirectory() as tmpdirname:
-                    pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
-                    torch.save(pt_model.state_dict(), pt_checkpoint_path)
-                    tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
-
-                    tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
-                    tf_model.save_weights(tf_checkpoint_path)
-                    pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
-
-                # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
-                pt_model.eval()
-                pt_inputs_dict = dict(
-                    (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items()
-                )
-                with torch.no_grad():
-                    pto = pt_model(**pt_inputs_dict)
-                tfo = tf_model(inputs_dict)
-                tfo = tfo[0].numpy()
-                pto = pto[0].numpy()
-                tfo[np.isnan(tfo)] = 0
-                pto[np.isnan(pto)] = 0
-                max_diff = np.amax(np.abs(tfo - pto))
-                self.assertLessEqual(max_diff, 2e-2)
-
-        def test_compile_tf_model(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            if self.is_encoder_decoder:
-                input_ids = {
-                    "decoder_input_ids": tf.keras.Input(
-                        batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"
-                    ),
-                    "encoder_input_ids": tf.keras.Input(
-                        batch_shape=(2, 2000), name="encoder_input_ids", dtype="int32"
-                    ),
-                }
-            else:
-                input_ids = tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32")
-            optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
-            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
-            metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
-
-            for model_class in self.all_model_classes:
-                # Prepare our model
-                model = model_class(config)
-
-                # Let's load it from the disk to be sure we can use pretrained weights
-                with TemporaryDirectory() as tmpdirname:
-                    outputs = model(inputs_dict)  # build the model
-                    model.save_pretrained(tmpdirname)
-                    model = model_class.from_pretrained(tmpdirname)
-
-                outputs_dict = model(input_ids)
-                hidden_states = outputs_dict[0]
-
-                # Add a dense layer on top to test intetgration with other keras modules
-                outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
-
-                # Compile extended model
-                extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs])
-                extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
-
-        def test_keyword_and_dict_args(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            for model_class in self.all_model_classes:
-                model = model_class(config)
-                outputs_dict = model(inputs_dict)
-
-                inputs_keywords = copy.deepcopy(inputs_dict)
-                input_ids = inputs_keywords.pop(
-                    "input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None
-                )
-                outputs_keywords = model(input_ids, **inputs_keywords)
-
-                output_dict = outputs_dict[0].numpy()
-                output_keywords = outputs_keywords[0].numpy()
-
-                self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
-
-        def test_attention_outputs(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            decoder_seq_length = (
-                self.model_tester.decoder_seq_length
-                if hasattr(self.model_tester, "decoder_seq_length")
-                else self.model_tester.seq_length
-            )
-            encoder_seq_length = (
-                self.model_tester.encoder_seq_length
-                if hasattr(self.model_tester, "encoder_seq_length")
-                else self.model_tester.seq_length
-            )
-            decoder_key_length = (
-                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length
-            )
-            encoder_key_length = (
-                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
-            )
-
-            for model_class in self.all_model_classes:
-                config.output_attentions = True
-                config.output_hidden_states = False
-                model = model_class(config)
-                outputs = model(inputs_dict)
-                attentions = [t.numpy() for t in outputs[-1]]
-                self.assertEqual(model.config.output_attentions, True)
-                self.assertEqual(model.config.output_hidden_states, False)
-                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-                )
-                out_len = len(outputs)
-
-                if self.is_encoder_decoder:
-                    self.assertEqual(out_len % 2, 0)
-                    decoder_attentions = outputs[(out_len // 2) - 1]
-                    self.assertEqual(model.config.output_attentions, True)
-                    self.assertEqual(model.config.output_hidden_states, False)
-                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-                    self.assertListEqual(
-                        list(decoder_attentions[0].shape[-3:]),
-                        [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
-                    )
-
-                # Check attention is always last and order is fine
-                config.output_attentions = True
-                config.output_hidden_states = True
-                model = model_class(config)
-                outputs = model(inputs_dict)
-                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
-                self.assertEqual(model.config.output_attentions, True)
-                self.assertEqual(model.config.output_hidden_states, True)
-
-                attentions = [t.numpy() for t in outputs[-1]]
-                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-                )
-
-        def test_hidden_states_output(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            for model_class in self.all_model_classes:
-                config.output_hidden_states = True
-                config.output_attentions = False
-                model = model_class(config)
-                outputs = model(inputs_dict)
-                hidden_states = [t.numpy() for t in outputs[-1]]
-                self.assertEqual(model.config.output_attentions, False)
-                self.assertEqual(model.config.output_hidden_states, True)
-                self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
-                self.assertListEqual(
-                    list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size]
-                )
-
-        def test_model_common_attributes(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            for model_class in self.all_model_classes:
-                model = model_class(config)
-                assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
-                x = model.get_output_embeddings()
-                assert x is None or isinstance(x, tf.keras.layers.Layer)
-
-        def test_determinism(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            for model_class in self.all_model_classes:
-                model = model_class(config)
-                first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
-                out_1 = first.numpy()
-                out_2 = second.numpy()
+                # Make sure we don't have nans
+                out_1 = after_outputs[0].numpy()
+                out_2 = outputs[0].numpy()
                 out_1 = out_1[~np.isnan(out_1)]
                 out_2 = out_2[~np.isnan(out_2)]
                 max_diff = np.amax(np.abs(out_1 - out_2))
                 self.assertLessEqual(max_diff, 1e-5)
 
-        def _get_embeds(self, wte, input_ids):
-            # ^^ In our TF models, the input_embeddings can take slightly different forms,
-            # so we try a few of them.
-            # We used to fall back to just synthetically creating a dummy tensor of ones:
+    def test_pt_tf_model_equivalence(self):
+        if not is_torch_available():
+            return
+
+        import torch
+        import transformers
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beggining
+            pt_model_class = getattr(transformers, pt_model_class_name)
+
+            config.output_hidden_states = True
+            tf_model = model_class(config)
+            pt_model = pt_model_class(config)
+
+            # Check we can load pt model in tf and vice-versa with model => model functions
+            tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict)
+            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
+
+            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
+            pt_model.eval()
+            pt_inputs_dict = dict(
+                (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items()
+            )
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs_dict)
+            tfo = tf_model(inputs_dict, training=False)
+            tf_hidden_states = tfo[0].numpy()
+            pt_hidden_states = pto[0].numpy()
+            tf_hidden_states[np.isnan(tf_hidden_states)] = 0
+            pt_hidden_states[np.isnan(pt_hidden_states)] = 0
+            max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
+            self.assertLessEqual(max_diff, 2e-2)
+
+            # Check we can load pt model in tf and vice-versa with checkpoint => model functions
+            with TemporaryDirectory() as tmpdirname:
+                pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
+                torch.save(pt_model.state_dict(), pt_checkpoint_path)
+                tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
+
+                tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
+                tf_model.save_weights(tf_checkpoint_path)
+                pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
+
+            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
+            pt_model.eval()
+            pt_inputs_dict = dict(
+                (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items()
+            )
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs_dict)
+            tfo = tf_model(inputs_dict)
+            tfo = tfo[0].numpy()
+            pto = pto[0].numpy()
+            tfo[np.isnan(tfo)] = 0
+            pto[np.isnan(pto)] = 0
+            max_diff = np.amax(np.abs(tfo - pto))
+            self.assertLessEqual(max_diff, 2e-2)
+
+    def test_compile_tf_model(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        if self.is_encoder_decoder:
+            input_ids = {
+                "decoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"),
+                "encoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="encoder_input_ids", dtype="int32"),
+            }
+        else:
+            input_ids = tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32")
+        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
+        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+        metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
+
+        for model_class in self.all_model_classes:
+            # Prepare our model
+            model = model_class(config)
+
+            # Let's load it from the disk to be sure we can use pretrained weights
+            with TemporaryDirectory() as tmpdirname:
+                outputs = model(inputs_dict)  # build the model
+                model.save_pretrained(tmpdirname)
+                model = model_class.from_pretrained(tmpdirname)
+
+            outputs_dict = model(input_ids)
+            hidden_states = outputs_dict[0]
+
+            # Add a dense layer on top to test intetgration with other keras modules
+            outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
+
+            # Compile extended model
+            extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs])
+            extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
+
+    def test_keyword_and_dict_args(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            outputs_dict = model(inputs_dict)
+
+            inputs_keywords = copy.deepcopy(inputs_dict)
+            input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None)
+            outputs_keywords = model(input_ids, **inputs_keywords)
+
+            output_dict = outputs_dict[0].numpy()
+            output_keywords = outputs_keywords[0].numpy()
+
+            self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
+
+    def test_attention_outputs(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        decoder_seq_length = (
+            self.model_tester.decoder_seq_length
+            if hasattr(self.model_tester, "decoder_seq_length")
+            else self.model_tester.seq_length
+        )
+        encoder_seq_length = (
+            self.model_tester.encoder_seq_length
+            if hasattr(self.model_tester, "encoder_seq_length")
+            else self.model_tester.seq_length
+        )
+        decoder_key_length = (
+            self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length
+        )
+        encoder_key_length = (
+            self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
+        )
+
+        for model_class in self.all_model_classes:
+            config.output_attentions = True
+            config.output_hidden_states = False
+            model = model_class(config)
+            outputs = model(inputs_dict)
+            attentions = [t.numpy() for t in outputs[-1]]
+            self.assertEqual(model.config.output_attentions, True)
+            self.assertEqual(model.config.output_hidden_states, False)
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+            )
+            out_len = len(outputs)
+
+            if self.is_encoder_decoder:
+                self.assertEqual(out_len % 2, 0)
+                decoder_attentions = outputs[(out_len // 2) - 1]
+                self.assertEqual(model.config.output_attentions, True)
+                self.assertEqual(model.config.output_hidden_states, False)
+                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                self.assertListEqual(
+                    list(decoder_attentions[0].shape[-3:]),
+                    [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
+                )
+
+            # Check attention is always last and order is fine
+            config.output_attentions = True
+            config.output_hidden_states = True
+            model = model_class(config)
+            outputs = model(inputs_dict)
+            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
+            self.assertEqual(model.config.output_attentions, True)
+            self.assertEqual(model.config.output_hidden_states, True)
+
+            attentions = [t.numpy() for t in outputs[-1]]
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+            )
+
+    def test_hidden_states_output(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            config.output_hidden_states = True
+            config.output_attentions = False
+            model = model_class(config)
+            outputs = model(inputs_dict)
+            hidden_states = [t.numpy() for t in outputs[-1]]
+            self.assertEqual(model.config.output_attentions, False)
+            self.assertEqual(model.config.output_hidden_states, True)
+            self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
+            self.assertListEqual(
+                list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size]
+            )
+
+    def test_model_common_attributes(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
+            x = model.get_output_embeddings()
+            assert x is None or isinstance(x, tf.keras.layers.Layer)
+
+    def test_determinism(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
+            out_1 = first.numpy()
+            out_2 = second.numpy()
+            out_1 = out_1[~np.isnan(out_1)]
+            out_2 = out_2[~np.isnan(out_2)]
+            max_diff = np.amax(np.abs(out_1 - out_2))
+            self.assertLessEqual(max_diff, 1e-5)
+
+    def _get_embeds(self, wte, input_ids):
+        # ^^ In our TF models, the input_embeddings can take slightly different forms,
+        # so we try a few of them.
+        # If none of them works, we fall back to synthetically creating a dummy tensor of ones:
+        try:
+            x = wte(input_ids, mode="embedding")
+        except Exception:
             try:
-                x = wte(input_ids, mode="embedding")
+                x = wte([input_ids], mode="embedding")
             except Exception:
                 try:
-                    x = wte([input_ids], mode="embedding")
+                    x = wte([input_ids, None, None, None], mode="embedding")
                 except Exception:
-                    try:
-                        x = wte([input_ids, None, None, None], mode="embedding")
-                    except Exception:
-                        if hasattr(self.model_tester, "embedding_size"):
-                            x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
-                        else:
-                            x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
-            return x
+                    if hasattr(self.model_tester, "embedding_size"):
+                        x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
+                    else:
+                        x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
+        return x
 
-        def test_inputs_embeds(self):
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+    def test_inputs_embeds(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        if not self.is_encoder_decoder:
+            input_ids = inputs_dict["input_ids"]
+            del inputs_dict["input_ids"]
+        else:
+            encoder_input_ids = inputs_dict["encoder_input_ids"]
+            decoder_input_ids = inputs_dict["decoder_input_ids"]
+            del inputs_dict["encoder_input_ids"]
+            del inputs_dict["decoder_input_ids"]
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+
+            wte = model.get_input_embeddings()
             if not self.is_encoder_decoder:
-                input_ids = inputs_dict["input_ids"]
-                del inputs_dict["input_ids"]
+                inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids)
             else:
-                encoder_input_ids = inputs_dict["encoder_input_ids"]
-                decoder_input_ids = inputs_dict["decoder_input_ids"]
-                del inputs_dict["encoder_input_ids"]
-                del inputs_dict["decoder_input_ids"]
+                inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
+                inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)
 
-            for model_class in self.all_model_classes:
-                model = model_class(config)
-
-                wte = model.get_input_embeddings()
-                if not self.is_encoder_decoder:
-                    inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids)
-                else:
-                    inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
-                    inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)
-
-                outputs = model(inputs_dict)
+            outputs = model(inputs_dict)
 
 
 def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
diff --git a/tests/test_modeling_tf_ctrl.py b/tests/test_modeling_tf_ctrl.py
index 02845e60b5..4c41f417cc 100644
--- a/tests/test_modeling_tf_ctrl.py
+++ b/tests/test_modeling_tf_ctrl.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import CTRLConfig, is_tf_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFCommonTestCases, ids_tensor
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_tf, slow
 
 
@@ -26,7 +28,7 @@ if is_tf_available():
 
 
 @require_tf
-class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
+class TFCTRLModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()
 
diff --git a/tests/test_modeling_tf_distilbert.py b/tests/test_modeling_tf_distilbert.py
index 3fa1ea849b..7dd0ad23e2 100644
--- a/tests/test_modeling_tf_distilbert.py
+++ b/tests/test_modeling_tf_distilbert.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import DistilBertConfig, is_tf_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFCommonTestCases, ids_tensor
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
 from .utils import require_tf
 
 
@@ -31,7 +33,7 @@ if is_tf_available():
 
 
 @require_tf
-class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
+class TFDistilBertModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
         (
diff --git a/tests/test_modeling_tf_gpt2.py b/tests/test_modeling_tf_gpt2.py
index e97462258e..8c718a34f6 100644
--- a/tests/test_modeling_tf_gpt2.py
+++ b/tests/test_modeling_tf_gpt2.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import GPT2Config, is_tf_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFCommonTestCases, ids_tensor
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_tf, slow
 
 
@@ -32,7 +34,7 @@ if is_tf_available():
 
 
 @require_tf
-class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
+class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel) if is_tf_available() else ()
     # all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel) if is_tf_available() else ()
diff --git a/tests/test_modeling_tf_openai_gpt.py b/tests/test_modeling_tf_openai_gpt.py
index ea463c0e4f..3a624f1252 100644
--- a/tests/test_modeling_tf_openai_gpt.py
+++ b/tests/test_modeling_tf_openai_gpt.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import OpenAIGPTConfig, is_tf_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFCommonTestCases, ids_tensor
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_tf, slow
 
 
@@ -32,7 +34,7 @@ if is_tf_available():
 
 
 @require_tf
-class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
+class TFOpenAIGPTModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
         (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else ()
diff --git a/tests/test_modeling_tf_roberta.py b/tests/test_modeling_tf_roberta.py
index 6318eeaabf..d924c5f5d7 100644
--- a/tests/test_modeling_tf_roberta.py
+++ b/tests/test_modeling_tf_roberta.py
@@ -19,7 +19,7 @@ import unittest
 from transformers import RobertaConfig, is_tf_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFCommonTestCases, ids_tensor
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_tf, slow
 
 
@@ -36,7 +36,7 @@ if is_tf_available():
 
 
 @require_tf
-class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
+class TFRobertaModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
         (TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification) if is_tf_available() else ()
diff --git a/tests/test_modeling_tf_t5.py b/tests/test_modeling_tf_t5.py
index e18213c0b5..69272addda 100644
--- a/tests/test_modeling_tf_t5.py
+++ b/tests/test_modeling_tf_t5.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import T5Config, is_tf_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFCommonTestCases, ids_tensor
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_tf, slow
 
 
@@ -26,7 +28,7 @@ if is_tf_available():
 
 
 @require_tf
-class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
+class TFT5ModelTest(TFModelTesterMixin, unittest.TestCase):
 
     is_encoder_decoder = True
     all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else ()
diff --git a/tests/test_modeling_tf_transfo_xl.py b/tests/test_modeling_tf_transfo_xl.py
index bc2d616fe2..62f81c45a4 100644
--- a/tests/test_modeling_tf_transfo_xl.py
+++ b/tests/test_modeling_tf_transfo_xl.py
@@ -15,11 +15,12 @@
 from __future__ import absolute_import, division, print_function
 
 import random
+import unittest
 
 from transformers import TransfoXLConfig, is_tf_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFCommonTestCases, ids_tensor
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_tf, slow
 
 
@@ -33,7 +34,7 @@ if is_tf_available():
 
 
 @require_tf
-class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
+class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (TFTransfoXLModel, TFTransfoXLLMHeadModel) if is_tf_available() else ()
     test_pruning = False
diff --git a/tests/test_modeling_tf_xlm.py b/tests/test_modeling_tf_xlm.py
index 0ee924ab61..e62bae66c8 100644
--- a/tests/test_modeling_tf_xlm.py
+++ b/tests/test_modeling_tf_xlm.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import is_tf_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFCommonTestCases, ids_tensor
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_tf, slow
 
 
@@ -34,7 +36,7 @@ if is_tf_available():
 
 
 @require_tf
-class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
+class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
         (TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple)
diff --git a/tests/test_modeling_tf_xlnet.py b/tests/test_modeling_tf_xlnet.py
index 0dcd8a776a..7b7570f889 100644
--- a/tests/test_modeling_tf_xlnet.py
+++ b/tests/test_modeling_tf_xlnet.py
@@ -15,11 +15,12 @@
 from __future__ import absolute_import, division, print_function
 
 import random
+import unittest
 
 from transformers import XLNetConfig, is_tf_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFCommonTestCases, ids_tensor
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_tf, slow
 
 
@@ -37,7 +38,7 @@ if is_tf_available():
 
 
 @require_tf
-class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
+class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
         (
diff --git a/tests/test_modeling_transfo_xl.py b/tests/test_modeling_transfo_xl.py
index 76886a83ad..1e18aeb09b 100644
--- a/tests/test_modeling_transfo_xl.py
+++ b/tests/test_modeling_transfo_xl.py
@@ -15,11 +15,12 @@
 from __future__ import absolute_import, division, print_function
 
 import random
+import unittest
 
 from transformers import is_torch_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_common import CommonTestCases, ids_tensor
+from .test_modeling_common import ModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
@@ -30,7 +31,7 @@ if is_torch_available():
 
 
 @require_torch
-class TransfoXLModelTest(CommonTestCases.CommonModelTester):
+class TransfoXLModelTest(ModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) if is_torch_available() else ()
     test_pruning = False
diff --git a/tests/test_modeling_xlm.py b/tests/test_modeling_xlm.py
index c921a4312f..22897249aa 100644
--- a/tests/test_modeling_xlm.py
+++ b/tests/test_modeling_xlm.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import unittest
+
 from transformers import is_torch_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_common import CommonTestCases, ids_tensor
+from .test_modeling_common import ModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
@@ -34,7 +36,7 @@ if is_torch_available():
 
 
 @require_torch
-class XLMModelTest(CommonTestCases.CommonModelTester):
+class XLMModelTest(ModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
         (
diff --git a/tests/test_modeling_xlnet.py b/tests/test_modeling_xlnet.py
index c295b79b28..e095a31e30 100644
--- a/tests/test_modeling_xlnet.py
+++ b/tests/test_modeling_xlnet.py
@@ -15,11 +15,12 @@
 from __future__ import absolute_import, division, print_function
 
 import random
+import unittest
 
 from transformers import is_torch_available
 
 from .test_configuration_common import ConfigTester
-from .test_modeling_common import CommonTestCases, ids_tensor
+from .test_modeling_common import ModelTesterMixin, ids_tensor
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
@@ -38,7 +39,7 @@ if is_torch_available():
 
 
 @require_torch
-class XLNetModelTest(CommonTestCases.CommonModelTester):
+class XLNetModelTest(ModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
         (