Expose get_config() on ModelTesters (#12812)

* Expose get_config() on ModelTesters
* Typo
2021-07-21 10:13:11 +02:00
parent cabcc75171
commit c3d9ac7607
53 changed files with 1249 additions and 1193 deletions
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py
@@ -22,6 +22,7 @@ from tests.test_modeling_common import floats_tensor
 from transformers import is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from transformers import {{cookiecutter.camelcase_modelname}}Config
 from .test_configuration_common import ConfigTester
 from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
@@ -30,7 +31,6 @@ if is_torch_available():
    import torch
    from transformers import (
        {{cookiecutter.camelcase_modelname}}Config,
        {{cookiecutter.camelcase_modelname}}ForCausalLM,
        {{cookiecutter.camelcase_modelname}}ForMaskedLM,
        {{cookiecutter.camelcase_modelname}}ForMultipleChoice,
@@ -112,7 +112,12 @@ class {{cookiecutter.camelcase_modelname}}ModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = {{cookiecutter.camelcase_modelname}}Config(
+        config = self.get_config()
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        return {{cookiecutter.camelcase_modelname}}Config(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -127,8 +132,6 @@ class {{cookiecutter.camelcase_modelname}}ModelTester:
            initializer_range=self.initializer_range,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
--- a/tests/test_modeling_albert.py
+++ b/tests/test_modeling_albert.py
@@ -16,7 +16,7 @@
 import unittest
-from transformers import is_torch_available
+from transformers import AlbertConfig, is_torch_available
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device
@@ -29,7 +29,6 @@ if is_torch_available():
    from transformers import (
        MODEL_FOR_PRETRAINING_MAPPING,
        AlbertConfig,
        AlbertForMaskedLM,
        AlbertForMultipleChoice,
        AlbertForPreTraining,
@@ -90,7 +89,12 @@ class AlbertModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = AlbertConfig(
+        config = self.get_config()
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        return AlbertConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -105,8 +109,6 @@ class AlbertModelTester:
            num_hidden_groups=self.num_hidden_groups,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def create_and_check_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -21,7 +21,7 @@ import unittest
 import timeout_decorator  # noqa
-from transformers import is_torch_available
+from transformers import BartConfig, is_torch_available
 from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -35,7 +35,6 @@ if is_torch_available():
    from transformers import (
        AutoModelForSequenceClassification,
        BartConfig,
        BartForCausalLM,
        BartForConditionalGeneration,
        BartForQuestionAnswering,
@@ -78,7 +77,6 @@ def prepare_bart_inputs_dict(
    }
@require_torch
 class BartModelTester:
    def __init__(
        self,
@@ -127,7 +125,12 @@ class BartModelTester:
        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        config = BartConfig(
+        config = self.get_config()
        inputs_dict = prepare_bart_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def get_config(self):
        return BartConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
@@ -143,8 +146,6 @@ class BartModelTester:
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
        )
        inputs_dict = prepare_bart_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
--- a/tests/test_modeling_bert.py
+++ b/tests/test_modeling_bert.py
@@ -16,7 +16,7 @@
 import unittest
-from transformers import is_torch_available
+from transformers import BertConfig, is_torch_available
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device
@@ -30,7 +30,6 @@ if is_torch_available():
    from transformers import (
        MODEL_FOR_PRETRAINING_MAPPING,
        BertConfig,
        BertForMaskedLM,
        BertForMultipleChoice,
        BertForNextSentencePrediction,
@@ -112,7 +111,15 @@ class BertModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = BertConfig(
+        config = self.get_config()
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        """
        Returns a tiny configuration by default.
        """
        return BertConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -127,8 +134,6 @@ class BertModelTester:
            initializer_range=self.initializer_range,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
--- a/tests/test_modeling_bert_generation.py
+++ b/tests/test_modeling_bert_generation.py
@@ -16,7 +16,7 @@
 import unittest
-from transformers import is_torch_available
+from transformers import BertGenerationConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -27,7 +27,7 @@ from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, r
 if is_torch_available():
    import torch
-    from transformers import BertGenerationConfig, BertGenerationDecoder, BertGenerationEncoder
+    from transformers import BertGenerationDecoder, BertGenerationEncoder
 class BertGenerationEncoderTester:
@@ -79,7 +79,12 @@ class BertGenerationEncoderTester:
        if self.use_labels:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        config = BertGenerationConfig(
+        config = self.get_config()
        return config, input_ids, input_mask, token_labels
    def get_config(self):
        return BertGenerationConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -93,8 +98,6 @@ class BertGenerationEncoderTester:
            initializer_range=self.initializer_range,
        )
        return config, input_ids, input_mask, token_labels
    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
--- a/tests/test_modeling_big_bird.py
+++ b/tests/test_modeling_big_bird.py
@@ -18,7 +18,7 @@
 import unittest
 from tests.test_modeling_common import floats_tensor
-from transformers import is_torch_available
+from transformers import BigBirdConfig, is_torch_available
 from transformers.models.auto import get_values
 from transformers.models.big_bird.tokenization_big_bird import BigBirdTokenizer
 from transformers.testing_utils import require_torch, slow, torch_device
@@ -32,7 +32,6 @@ if is_torch_available():
    from transformers import (
        MODEL_FOR_PRETRAINING_MAPPING,
        BigBirdConfig,
        BigBirdForCausalLM,
        BigBirdForMaskedLM,
        BigBirdForMultipleChoice,
@@ -126,7 +125,12 @@ class BigBirdModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = BigBirdConfig(
+        config = self.get_config()
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        return BigBirdConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -147,8 +151,6 @@ class BigBirdModelTester:
            position_embedding_type=self.position_embedding_type,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
--- a/tests/test_modeling_bigbird_pegasus.py
+++ b/tests/test_modeling_bigbird_pegasus.py
@@ -19,7 +19,7 @@ import copy
 import tempfile
 import unittest
-from transformers import is_torch_available
+from transformers import BigBirdPegasusConfig, is_torch_available
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -31,7 +31,6 @@ if is_torch_available():
    import torch
    from transformers import (
        BigBirdPegasusConfig,
        BigBirdPegasusForCausalLM,
        BigBirdPegasusForConditionalGeneration,
        BigBirdPegasusForQuestionAnswering,
@@ -69,7 +68,6 @@ def prepare_bigbird_pegasus_inputs_dict(
    return input_dict
@require_torch
 class BigBirdPegasusModelTester:
    def __init__(
        self,
@@ -129,7 +127,12 @@ class BigBirdPegasusModelTester:
        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        config = BigBirdPegasusConfig(
+        config = self.get_config()
        inputs_dict = prepare_bigbird_pegasus_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def get_config(self):
        return BigBirdPegasusConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
@@ -150,8 +153,6 @@ class BigBirdPegasusModelTester:
            num_random_blocks=self.num_random_blocks,
            scale_embedding=self.scale_embedding,
        )
        inputs_dict = prepare_bigbird_pegasus_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
--- a/tests/test_modeling_blenderbot.py
+++ b/tests/test_modeling_blenderbot.py
@@ -17,7 +17,7 @@
 import tempfile
 import unittest
-from transformers import is_torch_available
+from transformers import BlenderbotConfig, is_torch_available
 from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -29,7 +29,7 @@ from .test_modeling_common import ModelTesterMixin, ids_tensor
 if is_torch_available():
    import torch
-    from transformers import BlenderbotConfig, BlenderbotForConditionalGeneration, BlenderbotModel, BlenderbotTokenizer
+    from transformers import BlenderbotForConditionalGeneration, BlenderbotModel, BlenderbotTokenizer
    from transformers.models.blenderbot.modeling_blenderbot import (
        BlenderbotDecoder,
        BlenderbotEncoder,
@@ -68,7 +68,6 @@ def prepare_blenderbot_inputs_dict(
    }
@require_torch
 class BlenderbotModelTester:
    def __init__(
        self,
@@ -109,7 +108,6 @@ class BlenderbotModelTester:
        self.bos_token_id = bos_token_id
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
            3,
        )
@@ -117,7 +115,12 @@ class BlenderbotModelTester:
        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        config = BlenderbotConfig(
+        config = self.get_config()
        inputs_dict = prepare_blenderbot_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def get_config(self):
        return BlenderbotConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
@@ -133,8 +136,6 @@ class BlenderbotModelTester:
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
        )
        inputs_dict = prepare_blenderbot_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
--- a/tests/test_modeling_blenderbot_small.py
+++ b/tests/test_modeling_blenderbot_small.py
@@ -17,7 +17,7 @@
 import tempfile
 import unittest
-from transformers import is_torch_available
+from transformers import BlenderbotSmallConfig, is_torch_available
 from transformers.file_utils import cached_property
 from transformers.testing_utils import require_torch, slow, torch_device
@@ -29,12 +29,7 @@ from .test_modeling_common import ModelTesterMixin, ids_tensor
 if is_torch_available():
    import torch
-    from transformers import (
+    from transformers import BlenderbotSmallForConditionalGeneration, BlenderbotSmallModel, BlenderbotSmallTokenizer
        BlenderbotSmallConfig,
        BlenderbotSmallForConditionalGeneration,
        BlenderbotSmallModel,
        BlenderbotSmallTokenizer,
    )
    from transformers.models.blenderbot_small.modeling_blenderbot_small import (
        BlenderbotSmallDecoder,
        BlenderbotSmallEncoder,
@@ -73,7 +68,6 @@ def prepare_blenderbot_small_inputs_dict(
    }
@require_torch
 class BlenderbotSmallModelTester:
    def __init__(
        self,
@@ -114,7 +108,6 @@ class BlenderbotSmallModelTester:
        self.bos_token_id = bos_token_id
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
            3,
        )
@@ -122,7 +115,12 @@ class BlenderbotSmallModelTester:
        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        config = BlenderbotSmallConfig(
+        config = self.get_config()
        inputs_dict = prepare_blenderbot_small_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def get_config(self):
        return BlenderbotSmallConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
@@ -138,8 +136,6 @@ class BlenderbotSmallModelTester:
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
        )
        inputs_dict = prepare_blenderbot_small_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
--- a/tests/test_modeling_canine.py
+++ b/tests/test_modeling_canine.py
@@ -18,7 +18,7 @@
 import unittest
 from typing import List, Tuple
-from transformers import is_torch_available
+from transformers import CanineConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -29,7 +29,6 @@ if is_torch_available():
    import torch
    from transformers import (
        CanineConfig,
        CanineForMultipleChoice,
        CanineForQuestionAnswering,
        CanineForSequenceClassification,
@@ -106,7 +105,12 @@ class CanineModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = CanineConfig(
+        config = self.get_config()
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        return CanineConfig(
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
@@ -120,8 +124,6 @@ class CanineModelTester:
            initializer_range=self.initializer_range,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def create_and_check_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
--- a/tests/test_modeling_clip.py
+++ b/tests/test_modeling_clip.py
@@ -21,6 +21,7 @@ import tempfile
 import unittest
 import requests
 from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
 from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
@@ -32,7 +33,7 @@ if is_torch_available():
    import torch
    from torch import nn
-    from transformers import CLIPConfig, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPVisionConfig, CLIPVisionModel
+    from transformers import CLIPModel, CLIPTextModel, CLIPVisionModel
    from transformers.models.clip.modeling_clip import CLIP_PRETRAINED_MODEL_ARCHIVE_LIST
@@ -77,7 +78,12 @@ class CLIPVisionModelTester:
    def prepare_config_and_inputs(self):
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-        config = CLIPVisionConfig(
+        config = self.get_config()
        return config, pixel_values
    def get_config(self):
        return CLIPVisionConfig(
            image_size=self.image_size,
            patch_size=self.patch_size,
            num_channels=self.num_channels,
@@ -90,8 +96,6 @@ class CLIPVisionModelTester:
            initializer_range=self.initializer_range,
        )
        return config, pixel_values
    def create_and_check_model(self, config, pixel_values):
        model = CLIPVisionModel(config=config)
        model.to(torch_device)
@@ -323,7 +327,12 @@ class CLIPTextModelTester:
        if self.use_input_mask:
            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-        config = CLIPTextConfig(
+        config = self.get_config()
        return config, input_ids, input_mask
    def get_config(self):
        return CLIPTextConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -335,8 +344,6 @@ class CLIPTextModelTester:
            initializer_range=self.initializer_range,
        )
        return config, input_ids, input_mask
    def create_and_check_model(self, config, input_ids, input_mask):
        model = CLIPTextModel(config=config)
        model.to(torch_device)
@@ -409,10 +416,15 @@ class CLIPModelTester:
        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
        vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
-        config = CLIPConfig.from_text_vision_configs(text_config, vision_config, projection_dim=64)
+        config = self.get_config()
        return config, input_ids, attention_mask, pixel_values
    def get_config(self):
        """
        Build the combined `CLIPConfig` for this tester.

        The text and vision sub-configs come from the `get_config()` methods of
        `self.text_model_tester` and `self.vision_model_tester`; the projection
        dimension is fixed at 64.
        """
        text_config = self.text_model_tester.get_config()
        vision_config = self.vision_model_tester.get_config()
        return CLIPConfig.from_text_vision_configs(text_config, vision_config, projection_dim=64)
    def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
        model = CLIPModel(config).to(torch_device).eval()
        result = model(input_ids, pixel_values, attention_mask)
--- a/tests/test_modeling_convbert.py
+++ b/tests/test_modeling_convbert.py
@@ -18,7 +18,7 @@
 import unittest
 from tests.test_modeling_common import floats_tensor
-from transformers import is_torch_available
+from transformers import ConvBertConfig, is_torch_available
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device
@@ -31,7 +31,6 @@ if is_torch_available():
    from transformers import (
        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
        ConvBertConfig,
        ConvBertForMaskedLM,
        ConvBertForMultipleChoice,
        ConvBertForQuestionAnswering,
@@ -110,7 +109,12 @@ class ConvBertModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = ConvBertConfig(
+        config = self.get_config()
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        return ConvBertConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -125,8 +129,6 @@ class ConvBertModelTester:
            initializer_range=self.initializer_range,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
--- a/tests/test_modeling_ctrl.py
+++ b/tests/test_modeling_ctrl.py
@@ -15,7 +15,7 @@
 import unittest
-from transformers import is_torch_available
+from transformers import CTRLConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -28,7 +28,6 @@ if is_torch_available():
    from transformers import (
        CTRL_PRETRAINED_MODEL_ARCHIVE_LIST,
        CTRLConfig,
        CTRLForSequenceClassification,
        CTRLLMHeadModel,
        CTRLModel,
@@ -88,21 +87,7 @@ class CTRLModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = CTRLConfig(
+        config = self.get_config()
            vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
            # intermediate_size=self.intermediate_size,
            # hidden_act=self.hidden_act,
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
            n_ctx=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range,
            pad_token_id=self.pad_token_id,
        )
        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
@@ -118,6 +103,23 @@ class CTRLModelTester:
            choice_labels,
        )
    def get_config(self):
        """
        Build the `CTRLConfig` for the tests from this tester's attributes.

        `n_positions` and `n_ctx` are both taken from `self.max_position_embeddings`.
        No intermediate-size, activation, dropout, type-vocab or initializer-range
        arguments are passed, so `CTRLConfig` is left to use its own values for those.
        """
        config_kwargs = {
            "vocab_size": self.vocab_size,
            "n_embd": self.hidden_size,
            "n_layer": self.num_hidden_layers,
            "n_head": self.num_attention_heads,
            "n_positions": self.max_position_embeddings,
            "n_ctx": self.max_position_embeddings,
            "pad_token_id": self.pad_token_id,
        }
        return CTRLConfig(**config_kwargs)
    def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = CTRLModel(config=config)
        model.to(torch_device)
--- a/tests/test_modeling_deberta.py
+++ b/tests/test_modeling_deberta.py
@@ -12,10 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import unittest
-from transformers import is_torch_available
+from transformers import DebertaConfig, is_torch_available
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -26,7 +25,6 @@ if is_torch_available():
    import torch
    from transformers import (
        DebertaConfig,
        DebertaForMaskedLM,
        DebertaForQuestionAnswering,
        DebertaForSequenceClassification,
@@ -36,6 +34,179 @@ if is_torch_available():
    from transformers.models.deberta.modeling_deberta import DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
 class DebertaModelTester(object):
    """
    Helper used by the DeBERTa model tests.

    It stores the test hyperparameters, builds a `DebertaConfig` and inputs (created with `ids_tensor`) from
    them, and checks the output shapes of the DeBERTa model classes through the assert methods of `parent`.
    """
    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        use_token_type_ids=True,
        use_labels=True,
        vocab_size=99,
        hidden_size=32,
        num_hidden_layers=5,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=16,
        type_sequence_label_size=2,
        initializer_range=0.02,
        relative_attention=False,
        position_biased_input=True,
        pos_att_type="None",
        num_labels=3,
        num_choices=4,
        scope=None,
    ):
        """
        Store every argument as an attribute of the same name.

        `parent` is the object whose `assertEqual` / `assertListEqual` methods are called by the checks below
        (presumably the `unittest.TestCase` running the test -- confirm against the caller).
        `is_training` and `scope` are stored but not read by any method of this class.
        """
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_input_mask = use_input_mask
        self.use_token_type_ids = use_token_type_ids
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.type_sequence_label_size = type_sequence_label_size
        self.initializer_range = initializer_range
        self.num_labels = num_labels
        self.num_choices = num_choices
        self.relative_attention = relative_attention
        self.position_biased_input = position_biased_input
        self.pos_att_type = pos_att_type
        self.scope = scope
    def prepare_config_and_inputs(self):
        """
        Build a config plus the inputs and labels used by the `create_and_check_*` methods.

        Returns:
            A 7-tuple `(config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels,
            choice_labels)`. `input_mask`, `token_type_ids` and the three label entries are `None` when
            `use_input_mask`, `use_token_type_ids` or `use_labels` respectively is falsy.
        """
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        input_mask = None
        if self.use_input_mask:
            # The mask is built with ids_tensor and vocab_size=2 (presumably values in {0, 1} -- confirm in ids_tensor).
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
        config = self.get_config()
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        """
        Return a `DebertaConfig` built from the hyperparameters stored on this tester.

        `num_labels`, `num_choices` and `type_sequence_label_size` are not passed to the config here.
        """
        return DebertaConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
            relative_attention=self.relative_attention,
            position_biased_input=self.position_biased_input,
            pos_att_type=self.pos_att_type,
        )
    def check_loss_output(self, result):
        """Assert that `result.loss.size()` is empty, i.e. `list(result.loss.size()) == []`."""
        self.parent.assertListEqual(list(result.loss.size()), [])
    def create_and_check_deberta_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        """
        Run `DebertaModel` in eval mode on `torch_device` and check the size of its first output.

        The model is called three times with progressively fewer inputs. The label arguments are accepted
        but not used.
        """
        model = DebertaModel(config=config)
        model.to(torch_device)
        model.eval()
        # Each call overwrites sequence_output; only the last call (input_ids alone) is checked below.
        sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0]
        sequence_output = model(input_ids, token_type_ids=token_type_ids)[0]
        sequence_output = model(input_ids)[0]
        self.parent.assertListEqual(list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size])
    def create_and_check_deberta_for_masked_lm(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        """
        Run `DebertaForMaskedLM` with `token_labels` as labels and assert that `result.logits` has shape
        `(batch_size, seq_length, vocab_size)`.
        """
        model = DebertaForMaskedLM(config=config)
        model.to(torch_device)
        model.eval()
        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
    def create_and_check_deberta_for_sequence_classification(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        """
        Run `DebertaForSequenceClassification` with `sequence_labels` as labels, then assert that the logits
        have size `[batch_size, num_labels]` and that the loss passes `check_loss_output`.
        """
        # Mutates the config object passed in by the caller.
        config.num_labels = self.num_labels
        model = DebertaForSequenceClassification(config)
        model.to(torch_device)
        model.eval()
        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
        self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels])
        self.check_loss_output(result)
    def create_and_check_deberta_for_token_classification(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        """
        Run `DebertaForTokenClassification` with `token_labels` as labels and assert that `result.logits` has
        shape `(batch_size, seq_length, num_labels)`.
        """
        # Mutates the config object passed in by the caller.
        config.num_labels = self.num_labels
        model = DebertaForTokenClassification(config=config)
        model.to(torch_device)
        model.eval()
        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
    def create_and_check_deberta_for_question_answering(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        """
        Run `DebertaForQuestionAnswering` and assert that both `start_logits` and `end_logits` have shape
        `(batch_size, seq_length)`.
        """
        model = DebertaForQuestionAnswering(config=config)
        model.to(torch_device)
        model.eval()
        # sequence_labels is passed as both the start and the end positions.
        result = model(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            start_positions=sequence_labels,
            end_positions=sequence_labels,
        )
        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
    def prepare_config_and_inputs_for_common(self):
        """
        Return `(config, inputs_dict)` built from `prepare_config_and_inputs()`.

        `inputs_dict` has the keys `"input_ids"`, `"token_type_ids"` and `"attention_mask"`; the label entries
        are unpacked but left out of the dict.
        """
        config_and_inputs = self.prepare_config_and_inputs()
        (
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
        return config, inputs_dict
@require_torch
 class DebertaModelTest(ModelTesterMixin, unittest.TestCase):
@@ -56,179 +227,8 @@ class DebertaModelTest(ModelTesterMixin, unittest.TestCase):
    test_head_masking = False
    is_encoder_decoder = False
    class DebertaModelTester(object):
        """Build a small DebertaConfig with matching random inputs and check model outputs.

        All assertions are reported through ``parent``, the object passed to the
        constructor (its ``assert*`` methods are called by the checks below).
        """

        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            relative_attention=False,
            position_biased_input=True,
            pos_att_type="None",
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            """Store every argument unchanged as an attribute of the same name."""
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.relative_attention = relative_attention
            self.position_biased_input = position_biased_input
            self.pos_att_type = pos_att_type
            self.scope = scope

        def prepare_config_and_inputs(self):
            """Return ``(config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels)``.

            ``input_mask``, ``token_type_ids`` and the three label tensors are
            ``None`` when the corresponding ``use_*`` switch is off.
            """
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                # Drawn with vocab_size=2 -- presumably yields 0/1 entries; confirm against ids_tensor.
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = self.get_config()

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def get_config(self):
            """Build a DebertaConfig from the tester's hyper-parameters.

            Split out of ``prepare_config_and_inputs`` so a config can be obtained
            without generating any input tensors.
            """
            return DebertaConfig(
                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
                relative_attention=self.relative_attention,
                position_biased_input=self.position_biased_input,
                pos_att_type=self.pos_att_type,
            )

        def check_loss_output(self, result):
            """Assert that ``result.loss`` has an empty size, i.e. no dimensions."""
            self.parent.assertListEqual(list(result.loss.size()), [])

        def create_and_check_deberta_model(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            """Run DebertaModel with three argument combinations and check the output shape.

            Only the output of the last call (``input_ids`` alone) is asserted on;
            the first two calls just have to complete.
            """
            model = DebertaModel(config=config)
            model.to(torch_device)
            model.eval()
            sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0]
            sequence_output = model(input_ids, token_type_ids=token_type_ids)[0]
            sequence_output = model(input_ids)[0]

            self.parent.assertListEqual(
                list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size]
            )

        def create_and_check_deberta_for_masked_lm(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            """Run DebertaForMaskedLM once and check the logits shape (batch, seq, vocab)."""
            model = DebertaForMaskedLM(config=config)
            model.to(torch_device)
            model.eval()
            result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)

            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))

        def create_and_check_deberta_for_sequence_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            """Run DebertaForSequenceClassification once; check the logits shape and the loss size.

            ``config.num_labels`` is overwritten with the tester's ``num_labels`` first.
            """
            config.num_labels = self.num_labels
            model = DebertaForSequenceClassification(config)
            model.to(torch_device)
            model.eval()
            result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
            self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels])
            self.check_loss_output(result)

        def create_and_check_deberta_for_token_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            """Run DebertaForTokenClassification once and check the logits shape (batch, seq, labels).

            ``config.num_labels`` is overwritten with the tester's ``num_labels`` first.
            """
            config.num_labels = self.num_labels
            model = DebertaForTokenClassification(config=config)
            model.to(torch_device)
            model.eval()
            result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))

        def create_and_check_deberta_for_question_answering(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            """Run DebertaForQuestionAnswering once and check both span-logit shapes (batch, seq)."""
            model = DebertaForQuestionAnswering(config=config)
            model.to(torch_device)
            model.eval()
            # sequence_labels is passed as both the start and the end positions.
            result = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                start_positions=sequence_labels,
                end_positions=sequence_labels,
            )
            self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
            self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))

        def prepare_config_and_inputs_for_common(self):
            """Return ``(config, inputs_dict)``; the label tensors are dropped."""
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                token_type_ids,
                input_mask,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict
    def setUp(self):
-        self.model_tester = DebertaModelTest.DebertaModelTester(self)
+        self.model_tester = DebertaModelTester(self)
        self.config_tester = ConfigTester(self, config_class=DebertaConfig, hidden_size=37)
    def test_config(self):
--- a/tests/test_modeling_deberta_v2.py
+++ b/tests/test_modeling_deberta_v2.py
@@ -12,10 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import unittest
-from transformers import is_torch_available
+from transformers import DebertaV2Config, is_torch_available
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -26,7 +25,6 @@ if is_torch_available():
    import torch
    from transformers import (
        DebertaV2Config,
        DebertaV2ForMaskedLM,
        DebertaV2ForQuestionAnswering,
        DebertaV2ForSequenceClassification,
@@ -36,6 +34,179 @@ if is_torch_available():
    from transformers.models.deberta_v2.modeling_deberta_v2 import DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST
 class DebertaV2ModelTester(object):
    """Build a small DebertaV2Config with matching random inputs and check model outputs.

    All assertions are reported through ``parent``, the object passed to the
    constructor (its ``assert*`` methods are called by the checks below).
    """

    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        use_token_type_ids=True,
        use_labels=True,
        vocab_size=99,
        hidden_size=32,
        num_hidden_layers=5,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=16,
        type_sequence_label_size=2,
        initializer_range=0.02,
        relative_attention=False,
        position_biased_input=True,
        pos_att_type="None",
        num_labels=3,
        num_choices=4,
        scope=None,
    ):
        """Store every argument unchanged as an attribute of the same name."""
        # Object whose assert* methods report failures.
        self.parent = parent
        # Input geometry.
        self.batch_size, self.seq_length = batch_size, seq_length
        # Switches read by prepare_config_and_inputs (is_training is only stored here).
        self.is_training = is_training
        self.use_input_mask, self.use_token_type_ids, self.use_labels = (
            use_input_mask,
            use_token_type_ids,
            use_labels,
        )
        # Values forwarded to the config by get_config.
        self.vocab_size, self.hidden_size = vocab_size, hidden_size
        self.num_hidden_layers, self.num_attention_heads = num_hidden_layers, num_attention_heads
        self.intermediate_size, self.hidden_act = intermediate_size, hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings, self.type_vocab_size = max_position_embeddings, type_vocab_size
        self.type_sequence_label_size, self.initializer_range = type_sequence_label_size, initializer_range
        # Label-space sizes.
        self.num_labels, self.num_choices = num_labels, num_choices
        # Remaining config options, then the free-form scope.
        self.relative_attention = relative_attention
        self.position_biased_input = position_biased_input
        self.pos_att_type = pos_att_type
        self.scope = scope

    def prepare_config_and_inputs(self):
        """Return ``(config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels)``.

        ``input_mask``, ``token_type_ids`` and the three label tensors are ``None``
        when the corresponding ``use_*`` switch is off.
        """
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        # Drawn with vocab_size=2 -- presumably yields 0/1 entries; confirm against ids_tensor.
        input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) if self.use_input_mask else None

        token_type_ids = (
            ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) if self.use_token_type_ids else None
        )

        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
        else:
            sequence_labels = token_labels = choice_labels = None

        # The config is built last, after all tensors have been drawn.
        return (
            self.get_config(),
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
        )

    def get_config(self):
        """Build a DebertaV2Config from the tester's hyper-parameters."""
        config_kwargs = {
            "vocab_size": self.vocab_size,
            "hidden_size": self.hidden_size,
            "num_hidden_layers": self.num_hidden_layers,
            "num_attention_heads": self.num_attention_heads,
            "intermediate_size": self.intermediate_size,
            "hidden_act": self.hidden_act,
            "hidden_dropout_prob": self.hidden_dropout_prob,
            "attention_probs_dropout_prob": self.attention_probs_dropout_prob,
            "max_position_embeddings": self.max_position_embeddings,
            "type_vocab_size": self.type_vocab_size,
            "initializer_range": self.initializer_range,
            "relative_attention": self.relative_attention,
            "position_biased_input": self.position_biased_input,
            "pos_att_type": self.pos_att_type,
        }
        return DebertaV2Config(**config_kwargs)

    def check_loss_output(self, result):
        """Assert that ``result.loss`` has an empty size, i.e. no dimensions."""
        loss_dims = list(result.loss.size())
        self.parent.assertListEqual(loss_dims, [])

    def create_and_check_deberta_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        """Run DebertaV2Model with three argument combinations and check the output shape.

        Only the output of the last call (``input_ids`` alone) is asserted on; the
        first two calls just have to complete.
        """
        model = DebertaV2Model(config=config)
        model.to(torch_device)
        model.eval()
        optional_inputs = (
            {"attention_mask": input_mask, "token_type_ids": token_type_ids},
            {"token_type_ids": token_type_ids},
            {},
        )
        for extra_kwargs in optional_inputs:
            sequence_output = model(input_ids, **extra_kwargs)[0]
        expected_dims = [self.batch_size, self.seq_length, self.hidden_size]
        self.parent.assertListEqual(list(sequence_output.size()), expected_dims)

    def create_and_check_deberta_for_masked_lm(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        """Run DebertaV2ForMaskedLM once and check the logits shape (batch, seq, vocab)."""
        mlm = DebertaV2ForMaskedLM(config=config)
        mlm.to(torch_device)
        mlm.eval()
        outputs = mlm(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            labels=token_labels,
        )
        expected_shape = (self.batch_size, self.seq_length, self.vocab_size)
        self.parent.assertEqual(outputs.logits.shape, expected_shape)

    def create_and_check_deberta_for_sequence_classification(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        """Run DebertaV2ForSequenceClassification once; check the logits shape and the loss size.

        ``config.num_labels`` is overwritten with the tester's ``num_labels`` first.
        """
        config.num_labels = self.num_labels
        classifier = DebertaV2ForSequenceClassification(config)
        classifier.to(torch_device)
        classifier.eval()
        outputs = classifier(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            labels=sequence_labels,
        )
        logits_dims = list(outputs.logits.size())
        self.parent.assertListEqual(logits_dims, [self.batch_size, self.num_labels])
        self.check_loss_output(outputs)

    def create_and_check_deberta_for_token_classification(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        """Run DebertaV2ForTokenClassification once and check the logits shape (batch, seq, labels).

        ``config.num_labels`` is overwritten with the tester's ``num_labels`` first.
        """
        config.num_labels = self.num_labels
        tagger = DebertaV2ForTokenClassification(config=config)
        tagger.to(torch_device)
        tagger.eval()
        outputs = tagger(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            labels=token_labels,
        )
        expected_shape = (self.batch_size, self.seq_length, self.num_labels)
        self.parent.assertEqual(outputs.logits.shape, expected_shape)

    def create_and_check_deberta_for_question_answering(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        """Run DebertaV2ForQuestionAnswering once and check both span-logit shapes (batch, seq)."""
        qa_model = DebertaV2ForQuestionAnswering(config=config)
        qa_model.to(torch_device)
        qa_model.eval()
        # The same tensor (sequence_labels) is passed as start and end positions.
        outputs = qa_model(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            start_positions=sequence_labels,
            end_positions=sequence_labels,
        )
        span_shape = (self.batch_size, self.seq_length)
        for field in ("start_logits", "end_logits"):
            self.parent.assertEqual(getattr(outputs, field).shape, span_shape)

    def prepare_config_and_inputs_for_common(self):
        """Return ``(config, inputs_dict)``; the label tensors are dropped."""
        (
            config,
            input_ids,
            token_type_ids,
            input_mask,
            _sequence_labels,
            _token_labels,
            _choice_labels,
        ) = self.prepare_config_and_inputs()
        inputs_dict = dict(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=input_mask)
        return config, inputs_dict
@require_torch
 class DebertaV2ModelTest(ModelTesterMixin, unittest.TestCase):
@@ -56,179 +227,8 @@ class DebertaV2ModelTest(ModelTesterMixin, unittest.TestCase):
    test_head_masking = False
    is_encoder_decoder = False
    class DebertaV2ModelTester(object):
        """Build a small DebertaV2Config with matching random inputs and check model outputs.

        All assertions are reported through ``parent``, the object passed to the
        constructor (its ``assert*`` methods are called by the checks below).
        """

        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            relative_attention=False,
            position_biased_input=True,
            pos_att_type="None",
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            """Store every argument unchanged as an attribute of the same name."""
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.relative_attention = relative_attention
            self.position_biased_input = position_biased_input
            self.pos_att_type = pos_att_type
            self.scope = scope

        def prepare_config_and_inputs(self):
            """Return ``(config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels)``.

            ``input_mask``, ``token_type_ids`` and the three label tensors are
            ``None`` when the corresponding ``use_*`` switch is off.
            """
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                # Drawn with vocab_size=2 -- presumably yields 0/1 entries; confirm against ids_tensor.
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = self.get_config()

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def get_config(self):
            """Build a DebertaV2Config from the tester's hyper-parameters.

            Split out of ``prepare_config_and_inputs`` so a config can be obtained
            without generating any input tensors.
            """
            return DebertaV2Config(
                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
                relative_attention=self.relative_attention,
                position_biased_input=self.position_biased_input,
                pos_att_type=self.pos_att_type,
            )

        def check_loss_output(self, result):
            """Assert that ``result.loss`` has an empty size, i.e. no dimensions."""
            self.parent.assertListEqual(list(result.loss.size()), [])

        def create_and_check_deberta_model(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            """Run DebertaV2Model with three argument combinations and check the output shape.

            Only the output of the last call (``input_ids`` alone) is asserted on;
            the first two calls just have to complete.
            """
            model = DebertaV2Model(config=config)
            model.to(torch_device)
            model.eval()
            sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0]
            sequence_output = model(input_ids, token_type_ids=token_type_ids)[0]
            sequence_output = model(input_ids)[0]

            self.parent.assertListEqual(
                list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size]
            )

        def create_and_check_deberta_for_masked_lm(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            """Run DebertaV2ForMaskedLM once and check the logits shape (batch, seq, vocab)."""
            model = DebertaV2ForMaskedLM(config=config)
            model.to(torch_device)
            model.eval()
            result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)

            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))

        def create_and_check_deberta_for_sequence_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            """Run DebertaV2ForSequenceClassification once; check the logits shape and the loss size.

            ``config.num_labels`` is overwritten with the tester's ``num_labels`` first.
            """
            config.num_labels = self.num_labels
            model = DebertaV2ForSequenceClassification(config)
            model.to(torch_device)
            model.eval()
            result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
            self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels])
            self.check_loss_output(result)

        def create_and_check_deberta_for_token_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            """Run DebertaV2ForTokenClassification once and check the logits shape (batch, seq, labels).

            ``config.num_labels`` is overwritten with the tester's ``num_labels`` first.
            """
            config.num_labels = self.num_labels
            model = DebertaV2ForTokenClassification(config=config)
            model.to(torch_device)
            model.eval()
            result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))

        def create_and_check_deberta_for_question_answering(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            """Run DebertaV2ForQuestionAnswering once and check both span-logit shapes (batch, seq)."""
            model = DebertaV2ForQuestionAnswering(config=config)
            model.to(torch_device)
            model.eval()
            # sequence_labels is passed as both the start and the end positions.
            result = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                start_positions=sequence_labels,
                end_positions=sequence_labels,
            )
            self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
            self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))

        def prepare_config_and_inputs_for_common(self):
            """Return ``(config, inputs_dict)``; the label tensors are dropped."""
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                token_type_ids,
                input_mask,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict
    def setUp(self):
-        self.model_tester = DebertaV2ModelTest.DebertaV2ModelTester(self)
+        self.model_tester = DebertaV2ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=DebertaV2Config, hidden_size=37)
    def test_config(self):
--- a/tests/test_modeling_deit.py
+++ b/tests/test_modeling_deit.py
@@ -18,6 +18,7 @@
 import inspect
 import unittest
 from transformers import DeiTConfig
 from transformers.file_utils import cached_property, is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
@@ -31,7 +32,6 @@ if is_torch_available():
    from transformers import (
        MODEL_MAPPING,
        DeiTConfig,
        DeiTForImageClassification,
        DeiTForImageClassificationWithTeacher,
        DeiTModel,
@@ -92,7 +92,12 @@ class DeiTModelTester:
        if self.use_labels:
            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-        config = DeiTConfig(
+        config = self.get_config()
        return config, pixel_values, labels
    def get_config(self):
        return DeiTConfig(
            image_size=self.image_size,
            patch_size=self.patch_size,
            num_channels=self.num_channels,
@@ -107,8 +112,6 @@ class DeiTModelTester:
            initializer_range=self.initializer_range,
        )
        return config, pixel_values, labels
    def create_and_check_model(self, config, pixel_values, labels):
        model = DeiTModel(config=config)
        model.to(torch_device)
--- a/tests/test_modeling_detr.py
+++ b/tests/test_modeling_detr.py
@@ -19,7 +19,7 @@ import inspect
 import math
 import unittest
-from transformers import is_timm_available, is_vision_available
+from transformers import DetrConfig, is_timm_available, is_vision_available
 from transformers.file_utils import cached_property
 from transformers.testing_utils import require_timm, require_vision, slow, torch_device
@@ -31,7 +31,7 @@ from .test_modeling_common import ModelTesterMixin, _config_zero_init, floats_te
 if is_timm_available():
    import torch
-    from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrModel
+    from transformers import DetrForObjectDetection, DetrForSegmentation, DetrModel
 if is_vision_available():
@@ -40,7 +40,6 @@ if is_vision_available():
    from transformers import DetrFeatureExtractor
@require_timm
 class DetrModelTester:
    def __init__(
        self,
@@ -102,7 +101,11 @@ class DetrModelTester:
                target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device)
                labels.append(target)
-        config = DetrConfig(
+        config = self.get_config()
        return config, pixel_values, pixel_mask, labels
    def get_config(self):
        return DetrConfig(
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
            decoder_layers=self.num_hidden_layers,
@@ -115,7 +118,6 @@ class DetrModelTester:
            num_queries=self.num_queries,
            num_labels=self.num_labels,
        )
        return config, pixel_values, pixel_mask, labels
    def prepare_config_and_inputs_for_common(self):
        config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs()
--- a/tests/test_modeling_distilbert.py
+++ b/tests/test_modeling_distilbert.py
@@ -16,7 +16,7 @@
 import unittest
-from transformers import is_torch_available
+from transformers import DistilBertConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -28,7 +28,6 @@ if is_torch_available():
    from transformers import (
        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
        DistilBertConfig,
        DistilBertForMaskedLM,
        DistilBertForMultipleChoice,
        DistilBertForQuestionAnswering,
@@ -37,160 +36,162 @@ if is_torch_available():
        DistilBertModel,
    )
    class DistilBertModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=False,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
-        def prepare_config_and_inputs(self):
+class DistilBertModelTester(object):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        use_token_type_ids=False,
        use_labels=True,
        vocab_size=99,
        hidden_size=32,
        num_hidden_layers=5,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=16,
        type_sequence_label_size=2,
        initializer_range=0.02,
        num_labels=3,
        num_choices=4,
        scope=None,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_input_mask = use_input_mask
        self.use_token_type_ids = use_token_type_ids
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.type_sequence_label_size = type_sequence_label_size
        self.initializer_range = initializer_range
        self.num_labels = num_labels
        self.num_choices = num_choices
        self.scope = scope
-            input_mask = None
+    def prepare_config_and_inputs(self):
-            if self.use_input_mask:
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
                input_mask = random_attention_mask([self.batch_size, self.seq_length])
-            sequence_labels = None
+        input_mask = None
-            token_labels = None
+        if self.use_input_mask:
-            choice_labels = None
+            input_mask = random_attention_mask([self.batch_size, self.seq_length])
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-            config = DistilBertConfig(
+        sequence_labels = None
-                vocab_size=self.vocab_size,
+        token_labels = None
-                dim=self.hidden_size,
+        choice_labels = None
-                n_layers=self.num_hidden_layers,
+        if self.use_labels:
-                n_heads=self.num_attention_heads,
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                hidden_dim=self.intermediate_size,
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                hidden_act=self.hidden_act,
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
                dropout=self.hidden_dropout_prob,
                attention_dropout=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                initializer_range=self.initializer_range,
            )
-            return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        config = self.get_config()
-        def create_and_check_distilbert_model(
+        return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = DistilBertModel(config=config)
            model.to(torch_device)
            model.eval()
            result = model(input_ids, input_mask)
            result = model(input_ids)
            self.parent.assertEqual(
                result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)
            )
-        def create_and_check_distilbert_for_masked_lm(
+    def get_config(self):
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        return DistilBertConfig(
-        ):
+            vocab_size=self.vocab_size,
-            model = DistilBertForMaskedLM(config=config)
+            dim=self.hidden_size,
-            model.to(torch_device)
+            n_layers=self.num_hidden_layers,
-            model.eval()
+            n_heads=self.num_attention_heads,
-            result = model(input_ids, attention_mask=input_mask, labels=token_labels)
+            hidden_dim=self.intermediate_size,
-            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+            hidden_act=self.hidden_act,
            dropout=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            initializer_range=self.initializer_range,
        )
-        def create_and_check_distilbert_for_question_answering(
+    def create_and_check_distilbert_model(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
+    ):
-            model = DistilBertForQuestionAnswering(config=config)
+        model = DistilBertModel(config=config)
-            model.to(torch_device)
+        model.to(torch_device)
-            model.eval()
+        model.eval()
-            result = model(
+        result = model(input_ids, input_mask)
-                input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels
+        result = model(input_ids)
-            )
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
            self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
            self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-        def create_and_check_distilbert_for_sequence_classification(
+    def create_and_check_distilbert_for_masked_lm(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
+    ):
-            config.num_labels = self.num_labels
+        model = DistilBertForMaskedLM(config=config)
-            model = DistilBertForSequenceClassification(config)
+        model.to(torch_device)
-            model.to(torch_device)
+        model.eval()
-            model.eval()
+        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
-            result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-        def create_and_check_distilbert_for_token_classification(
+    def create_and_check_distilbert_for_question_answering(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
+    ):
-            config.num_labels = self.num_labels
+        model = DistilBertForQuestionAnswering(config=config)
-            model = DistilBertForTokenClassification(config=config)
+        model.to(torch_device)
-            model.to(torch_device)
+        model.eval()
-            model.eval()
+        result = model(
            input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels
        )
        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-            result = model(input_ids, attention_mask=input_mask, labels=token_labels)
+    def create_and_check_distilbert_for_sequence_classification(
-            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
+        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_labels = self.num_labels
        model = DistilBertForSequenceClassification(config)
        model.to(torch_device)
        model.eval()
        result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-        def create_and_check_distilbert_for_multiple_choice(
+    def create_and_check_distilbert_for_token_classification(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
+    ):
-            config.num_choices = self.num_choices
+        config.num_labels = self.num_labels
-            model = DistilBertForMultipleChoice(config=config)
+        model = DistilBertForTokenClassification(config=config)
-            model.to(torch_device)
+        model.to(torch_device)
-            model.eval()
+        model.eval()
            multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
            result = model(
                multiple_choice_inputs_ids,
                attention_mask=multiple_choice_input_mask,
                labels=choice_labels,
            )
            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-        def prepare_config_and_inputs_for_common(self):
+        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
-            config_and_inputs = self.prepare_config_and_inputs()
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-            (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
+
-            inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
+    def create_and_check_distilbert_for_multiple_choice(
-            return config, inputs_dict
+        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_choices = self.num_choices
        model = DistilBertForMultipleChoice(config=config)
        model.to(torch_device)
        model.eval()
        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
        result = model(
            multiple_choice_inputs_ids,
            attention_mask=multiple_choice_input_mask,
            labels=choice_labels,
        )
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
    def prepare_config_and_inputs_for_common(self):
        """Return ``(config, inputs_dict)`` derived from ``prepare_config_and_inputs()``.

        The six-element result of ``prepare_config_and_inputs()`` is unpacked; only the config,
        the input ids and the input mask are kept. The three label entries are discarded.
        """
        # Unpack all six slots explicitly so a result of a different length still raises.
        config, input_ids, input_mask, _sequence_labels, _token_labels, _choice_labels = (
            self.prepare_config_and_inputs()
        )
        return config, {"input_ids": input_ids, "attention_mask": input_mask}
@require_torch
--- a/tests/test_modeling_dpr.py
+++ b/tests/test_modeling_dpr.py
@@ -16,7 +16,7 @@
 import unittest
-from transformers import is_torch_available
+from transformers import DPRConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -26,7 +26,7 @@ from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention
 if is_torch_available():
    import torch
-    from transformers import DPRConfig, DPRContextEncoder, DPRQuestionEncoder, DPRReader, DPRReaderTokenizer
+    from transformers import DPRContextEncoder, DPRQuestionEncoder, DPRReader, DPRReaderTokenizer
    from transformers.models.dpr.modeling_dpr import (
        DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
        DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -104,7 +104,12 @@ class DPRModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = DPRConfig(
+        config = self.get_config()
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        return DPRConfig(
            projection_dim=self.projection_dim,
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
@@ -119,8 +124,6 @@ class DPRModelTester:
            initializer_range=self.initializer_range,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def create_and_check_context_encoder(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
--- a/tests/test_modeling_electra.py
+++ b/tests/test_modeling_electra.py
@@ -16,7 +16,7 @@
 import unittest
-from transformers import is_torch_available
+from transformers import ElectraConfig, is_torch_available
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device
@@ -29,7 +29,6 @@ if is_torch_available():
    from transformers import (
        MODEL_FOR_PRETRAINING_MAPPING,
        ElectraConfig,
        ElectraForMaskedLM,
        ElectraForMultipleChoice,
        ElectraForPreTraining,
@@ -89,7 +88,21 @@ class ElectraModelTester:
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
            fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1)
-        config = ElectraConfig(
+        config = self.get_config()
        return (
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
            fake_token_labels,
        )
    def get_config(self):
        return ElectraConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -104,17 +117,6 @@ class ElectraModelTester:
            initializer_range=self.initializer_range,
        )
        return (
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
            fake_token_labels,
        )
    def create_and_check_electra_model(
        self,
        config,
--- a/tests/test_modeling_flaubert.py
+++ b/tests/test_modeling_flaubert.py
@@ -13,10 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import unittest
-from transformers import is_torch_available
+from transformers import FlaubertConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -27,7 +26,6 @@ if is_torch_available():
    import torch
    from transformers import (
        FlaubertConfig,
        FlaubertForMultipleChoice,
        FlaubertForQuestionAnswering,
        FlaubertForQuestionAnsweringSimple,
@@ -96,7 +94,22 @@ class FlaubertModelTester(object):
            is_impossible_labels = ids_tensor([self.batch_size], 2).float()
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = FlaubertConfig(
+        config = self.get_config()
        return (
            config,
            input_ids,
            token_type_ids,
            input_lengths,
            sequence_labels,
            token_labels,
            is_impossible_labels,
            choice_labels,
            input_mask,
        )
    def get_config(self):
        return FlaubertConfig(
            vocab_size=self.vocab_size,
            n_special=self.n_special,
            emb_dim=self.hidden_size,
@@ -115,18 +128,6 @@ class FlaubertModelTester(object):
            use_proj=self.use_proj,
        )
        return (
            config,
            input_ids,
            token_type_ids,
            input_lengths,
            sequence_labels,
            token_labels,
            is_impossible_labels,
            choice_labels,
            input_mask,
        )
    def create_and_check_flaubert_model(
        self,
        config,
--- a/tests/test_modeling_fsmt.py
+++ b/tests/test_modeling_fsmt.py
@@ -19,7 +19,7 @@ import unittest
 import timeout_decorator  # noqa
 from parameterized import parameterized
-from transformers import is_torch_available
+from transformers import FSMTConfig, is_torch_available
 from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -32,7 +32,7 @@ if is_torch_available():
    import torch
    from torch import nn
-    from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTModel, FSMTTokenizer
+    from transformers import FSMTForConditionalGeneration, FSMTModel, FSMTTokenizer
    from transformers.models.fsmt.modeling_fsmt import (
        SinusoidalPositionalEmbedding,
        _prepare_fsmt_decoder_inputs,
@@ -42,8 +42,7 @@ if is_torch_available():
    from transformers.pipelines import TranslationPipeline
-@require_torch
+class FSMTModelTester:
 class ModelTester:
    def __init__(
        self,
        parent,
@@ -78,7 +77,12 @@ class ModelTester:
        )
        input_ids[:, -1] = 2  # Eos Token
-        config = FSMTConfig(
+        config = self.get_config()
        inputs_dict = prepare_fsmt_inputs_dict(config, input_ids)
        return config, inputs_dict
    def get_config(self):
        return FSMTConfig(
            vocab_size=self.src_vocab_size,  # hack needed for common tests
            src_vocab_size=self.src_vocab_size,
            tgt_vocab_size=self.tgt_vocab_size,
@@ -97,8 +101,6 @@ class ModelTester:
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
        )
        inputs_dict = prepare_fsmt_inputs_dict(config, input_ids)
        return config, inputs_dict
    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
@@ -141,7 +143,7 @@ class FSMTModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    test_missing_keys = False
    def setUp(self):
-        self.model_tester = ModelTester(self)
+        self.model_tester = FSMTModelTester(self)
        self.langs = ["en", "ru"]
        config = {
            "langs": self.langs,
--- a/tests/test_modeling_funnel.py
+++ b/tests/test_modeling_funnel.py
@@ -16,7 +16,7 @@
 import unittest
-from transformers import FunnelTokenizer, is_torch_available
+from transformers import FunnelConfig, FunnelTokenizer, is_torch_available
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -30,7 +30,6 @@ if is_torch_available():
    from transformers import (
        MODEL_FOR_PRETRAINING_MAPPING,
        FunnelBaseModel,
        FunnelConfig,
        FunnelForMaskedLM,
        FunnelForMultipleChoice,
        FunnelForPreTraining,
@@ -127,7 +126,21 @@ class FunnelModelTester:
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
            fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1)
-        config = FunnelConfig(
+        config = self.get_config()
        return (
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
            fake_token_labels,
        )
    def get_config(self):
        return FunnelConfig(
            vocab_size=self.vocab_size,
            block_sizes=self.block_sizes,
            num_decoder_layers=self.num_decoder_layers,
@@ -143,17 +156,6 @@ class FunnelModelTester:
            type_vocab_size=self.type_vocab_size,
        )
        return (
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
            fake_token_labels,
        )
    def create_and_check_model(
        self,
        config,
--- a/tests/test_modeling_gpt2.py
+++ b/tests/test_modeling_gpt2.py
@@ -17,7 +17,7 @@
 import datetime
 import unittest
-from transformers import is_torch_available
+from transformers import GPT2Config, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -30,7 +30,6 @@ if is_torch_available():
    from transformers import (
        GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
        GPT2Config,
        GPT2DoubleHeadsModel,
        GPT2ForSequenceClassification,
        GPT2LMHeadModel,
@@ -119,25 +118,7 @@ class GPT2ModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = GPT2Config(
+        config = self.get_config(gradient_checkpointing=gradient_checkpointing)
            vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
            # intermediate_size=self.intermediate_size,
            # hidden_act=self.hidden_act,
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
            n_ctx=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range,
            use_cache=not gradient_checkpointing,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            gradient_checkpointing=gradient_checkpointing,
        )
        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
@@ -153,6 +134,27 @@ class GPT2ModelTester:
            choice_labels,
        )
    def get_config(self, gradient_checkpointing=False):
        """Build the ``GPT2Config`` described by this tester's attributes.

        Args:
            gradient_checkpointing: Forwarded to the config as ``gradient_checkpointing``;
                ``use_cache`` is set to its negation.

        Returns:
            The ``GPT2Config`` instance constructed from the tester's stored values.
        """
        return GPT2Config(
            vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
            # The tester's BERT-style fields (intermediate_size, hidden_act, hidden_dropout_prob,
            # attention_probs_dropout_prob, type_vocab_size) are deliberately not forwarded.
            # NOTE(review): GPT2Config does not appear to declare parameters with those names, so
            # passing them would only attach unused extra attributes -- confirm against its signature.
            n_positions=self.max_position_embeddings,
            n_ctx=self.max_position_embeddings,
            initializer_range=self.initializer_range,
            use_cache=not gradient_checkpointing,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            gradient_checkpointing=gradient_checkpointing,
        )
    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
--- a/tests/test_modeling_gpt_neo.py
+++ b/tests/test_modeling_gpt_neo.py
@@ -17,7 +17,7 @@
 import unittest
-from transformers import is_torch_available
+from transformers import GPTNeoConfig, is_torch_available
 from transformers.file_utils import cached_property
 from transformers.testing_utils import require_torch, slow, torch_device
@@ -32,7 +32,6 @@ if is_torch_available():
    from transformers import (
        GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST,
        GPT2Tokenizer,
        GPTNeoConfig,
        GPTNeoForCausalLM,
        GPTNeoForSequenceClassification,
        GPTNeoModel,
@@ -123,20 +122,7 @@ class GPTNeoModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = GPTNeoConfig(
+        config = self.get_config(gradient_checkpointing=False)
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_hidden_layers,
            num_heads=self.num_attention_heads,
            max_position_embeddings=self.max_position_embeddings,
            use_cache=not gradient_checkpointing,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            gradient_checkpointing=gradient_checkpointing,
            window_size=self.window_size,
            attention_types=self.attention_types,
        )
        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
@@ -152,6 +138,22 @@ class GPTNeoModelTester:
            choice_labels,
        )
    def get_config(self, gradient_checkpointing=False):
        """Build the ``GPTNeoConfig`` described by this tester's attributes.

        Args:
            gradient_checkpointing: Forwarded to the config as ``gradient_checkpointing``;
                ``use_cache`` is set to its negation.

        Returns:
            The ``GPTNeoConfig`` instance constructed from the tester's stored values.
        """
        # Collect the keyword arguments first, in the same order they were originally passed.
        config_kwargs = {
            "vocab_size": self.vocab_size,
            "hidden_size": self.hidden_size,
            "num_layers": self.num_hidden_layers,
            "num_heads": self.num_attention_heads,
            "max_position_embeddings": self.max_position_embeddings,
            "use_cache": not gradient_checkpointing,
            "bos_token_id": self.bos_token_id,
            "eos_token_id": self.eos_token_id,
            "pad_token_id": self.pad_token_id,
            "gradient_checkpointing": gradient_checkpointing,
            "window_size": self.window_size,
            "attention_types": self.attention_types,
        }
        return GPTNeoConfig(**config_kwargs)
    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
--- a/tests/test_modeling_hubert.py
+++ b/tests/test_modeling_hubert.py
@@ -21,7 +21,7 @@ import unittest
 import pytest
 from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask
-from transformers import is_torch_available
+from transformers import HubertConfig, is_torch_available
 from transformers.testing_utils import require_datasets, require_soundfile, require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -31,7 +31,7 @@ from .test_modeling_common import ModelTesterMixin, _config_zero_init
 if is_torch_available():
    import torch
-    from transformers import HubertConfig, HubertForCTC, HubertModel, Wav2Vec2Processor
+    from transformers import HubertForCTC, HubertModel, Wav2Vec2Processor
    from transformers.models.hubert.modeling_hubert import _compute_mask_indices
@@ -98,7 +98,12 @@ class HubertModelTester:
        input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size)
        attention_mask = random_attention_mask([self.batch_size, self.seq_length])
-        config = HubertConfig(
+        config = self.get_config()
        return config, input_values, attention_mask
    def get_config(self):
        return HubertConfig(
            hidden_size=self.hidden_size,
            feat_extract_norm=self.feat_extract_norm,
            feat_extract_dropout=self.feat_extract_dropout,
@@ -119,8 +124,6 @@ class HubertModelTester:
            vocab_size=self.vocab_size,
        )
        return config, input_values, attention_mask
    def create_and_check_model(self, config, input_values, attention_mask):
        model = HubertModel(config=config)
        model.to(torch_device)
--- a/tests/test_modeling_ibert.py
+++ b/tests/test_modeling_ibert.py
@@ -17,7 +17,7 @@
 import copy
 import unittest
-from transformers import is_torch_available
+from transformers import IBertConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -30,7 +30,6 @@ if is_torch_available():
    from transformers import (
        IBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
        IBertConfig,
        IBertForMaskedLM,
        IBertForMultipleChoice,
        IBertForQuestionAnswering,
@@ -97,7 +96,12 @@ class IBertModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = IBertConfig(
+        config = self.get_config()
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        return IBertConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -112,8 +116,6 @@ class IBertModelTester:
            quant_mode=True,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def create_and_check_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
--- a/tests/test_modeling_layoutlm.py
+++ b/tests/test_modeling_layoutlm.py
@@ -16,7 +16,7 @@
 import unittest
-from transformers import is_torch_available
+from transformers import LayoutLMConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -27,7 +27,6 @@ if is_torch_available():
    import torch
    from transformers import (
        LayoutLMConfig,
        LayoutLMForMaskedLM,
        LayoutLMForSequenceClassification,
        LayoutLMForTokenClassification,
@@ -120,7 +119,12 @@ class LayoutLMModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = LayoutLMConfig(
+        config = self.get_config()
        return config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        return LayoutLMConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -134,8 +138,6 @@ class LayoutLMModelTester:
            initializer_range=self.initializer_range,
        )
        return config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def create_and_check_model(
        self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
--- a/tests/test_modeling_led.py
+++ b/tests/test_modeling_led.py
@@ -19,7 +19,7 @@ import copy
 import tempfile
 import unittest
-from transformers import is_torch_available
+from transformers import LEDConfig, is_torch_available
 from transformers.file_utils import cached_property
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -34,7 +34,6 @@ if is_torch_available():
    from transformers import (
        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
        LEDConfig,
        LEDForConditionalGeneration,
        LEDForQuestionAnswering,
        LEDForSequenceClassification,
@@ -75,7 +74,6 @@ def prepare_led_inputs_dict(
    }
@require_torch
 class LEDModelTester:
    def __init__(
        self,
@@ -141,7 +139,12 @@ class LEDModelTester:
        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        config = LEDConfig(
+        config = self.get_config()
        inputs_dict = prepare_led_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def get_config(self):
        return LEDConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
@@ -158,8 +161,6 @@ class LEDModelTester:
            pad_token_id=self.pad_token_id,
            attention_window=self.attention_window,
        )
        inputs_dict = prepare_led_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
--- a/tests/test_modeling_longformer.py
+++ b/tests/test_modeling_longformer.py
@@ -16,7 +16,7 @@
 import unittest
-from transformers import is_torch_available
+from transformers import LongformerConfig, is_torch_available
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -27,7 +27,6 @@ if is_torch_available():
    import torch
    from transformers import (
        LongformerConfig,
        LongformerForMaskedLM,
        LongformerForMultipleChoice,
        LongformerForQuestionAnswering,
@@ -100,7 +99,12 @@ class LongformerModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = LongformerConfig(
+        config = self.get_config()
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        return LongformerConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -115,8 +119,6 @@ class LongformerModelTester:
            attention_window=self.attention_window,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def create_and_check_attention_mask_determinism(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
--- a/tests/test_modeling_luke.py
+++ b/tests/test_modeling_luke.py
@@ -13,10 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Testing suite for the PyTorch LUKE model. """
 import unittest
-from transformers import is_torch_available
+from transformers import LukeConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -27,7 +26,6 @@ if is_torch_available():
    import torch
    from transformers import (
        LukeConfig,
        LukeForEntityClassification,
        LukeForEntityPairClassification,
        LukeForEntitySpanClassification,
@@ -154,7 +152,25 @@ class LukeModelTester:
                [self.batch_size, self.entity_length], self.num_entity_span_classification_labels
            )
-        config = LukeConfig(
+        config = self.get_config()
        return (
            config,
            input_ids,
            attention_mask,
            token_type_ids,
            entity_ids,
            entity_attention_mask,
            entity_token_type_ids,
            entity_position_ids,
            sequence_labels,
            entity_classification_labels,
            entity_pair_classification_labels,
            entity_span_classification_labels,
        )
    def get_config(self):
        return LukeConfig(
            vocab_size=self.vocab_size,
            entity_vocab_size=self.entity_vocab_size,
            entity_emb_size=self.entity_emb_size,
@@ -172,21 +188,6 @@ class LukeModelTester:
            use_entity_aware_attention=self.use_entity_aware_attention,
        )
        return (
            config,
            input_ids,
            attention_mask,
            token_type_ids,
            entity_ids,
            entity_attention_mask,
            entity_token_type_ids,
            entity_position_ids,
            sequence_labels,
            entity_classification_labels,
            entity_pair_classification_labels,
            entity_span_classification_labels,
        )
    def create_and_check_model(
        self,
        config,
--- a/tests/test_modeling_lxmert.py
+++ b/tests/test_modeling_lxmert.py
@@ -19,7 +19,7 @@ import unittest
 import numpy as np
-from transformers import is_torch_available
+from transformers import LxmertConfig, is_torch_available
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device
@@ -33,7 +33,6 @@ if is_torch_available():
    from transformers import (
        MODEL_FOR_PRETRAINING_MAPPING,
        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
        LxmertConfig,
        LxmertForPreTraining,
        LxmertForQuestionAnswering,
        LxmertModel,
@@ -170,7 +169,24 @@ class LxmertModelTester:
        if self.task_matched:
            matched_label = ids_tensor([self.batch_size], self.num_labels)
-        config = LxmertConfig(
+        config = self.get_config()
        return (
            config,
            input_ids,
            visual_feats,
            bounding_boxes,
            token_type_ids,
            input_mask,
            obj_labels,
            masked_lm_labels,
            matched_label,
            ans,
            output_attentions,
        )
    def get_config(self):
        return LxmertConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_attention_heads=self.num_attention_heads,
@@ -204,20 +220,6 @@ class LxmertModelTester:
            output_hidden_states=self.output_hidden_states,
        )
        return (
            config,
            input_ids,
            visual_feats,
            bounding_boxes,
            token_type_ids,
            input_mask,
            obj_labels,
            masked_lm_labels,
            matched_label,
            ans,
            output_attentions,
        )
    def create_and_check_lxmert_model(
        self,
        config,
--- a/tests/test_modeling_m2m_100.py
+++ b/tests/test_modeling_m2m_100.py
@@ -19,7 +19,7 @@ import copy
 import tempfile
 import unittest
-from transformers import is_torch_available
+from transformers import M2M100Config, is_torch_available
 from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -31,7 +31,7 @@ from .test_modeling_common import ModelTesterMixin, ids_tensor
 if is_torch_available():
    import torch
-    from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Model, M2M100Tokenizer
+    from transformers import M2M100ForConditionalGeneration, M2M100Model, M2M100Tokenizer
    from transformers.models.m2m_100.modeling_m2m_100 import M2M100Decoder, M2M100Encoder
@@ -66,7 +66,6 @@ def prepare_m2m_100_inputs_dict(
    }
@require_torch
 class M2M100ModelTester:
    def __init__(
        self,
@@ -125,7 +124,12 @@ class M2M100ModelTester:
        input_ids = input_ids.clamp(self.pad_token_id + 1)
        decoder_input_ids = decoder_input_ids.clamp(self.pad_token_id + 1)
-        config = M2M100Config(
+        config = self.get_config()
        inputs_dict = prepare_m2m_100_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def get_config(self):
        return M2M100Config(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
@@ -143,8 +147,6 @@ class M2M100ModelTester:
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
        )
        inputs_dict = prepare_m2m_100_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
--- a/tests/test_modeling_marian.py
+++ b/tests/test_modeling_marian.py
@@ -17,7 +17,7 @@
 import tempfile
 import unittest
-from transformers import is_torch_available
+from transformers import MarianConfig, is_torch_available
 from transformers.file_utils import cached_property
 from transformers.hf_api import HfApi
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -34,7 +34,6 @@ if is_torch_available():
        AutoConfig,
        AutoModelWithLMHead,
        AutoTokenizer,
        MarianConfig,
        MarianModel,
        MarianMTModel,
        TranslationPipeline,
@@ -83,7 +82,6 @@ def prepare_marian_inputs_dict(
    }
@require_torch
 class MarianModelTester:
    def __init__(
        self,
@@ -126,7 +124,6 @@ class MarianModelTester:
        self.decoder_start_token_id = decoder_start_token_id
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
            3,
        )
@@ -134,7 +131,12 @@ class MarianModelTester:
        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        config = MarianConfig(
+        config = self.get_config()
        inputs_dict = prepare_marian_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def get_config(self):
        return MarianConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
@@ -151,8 +153,6 @@ class MarianModelTester:
            pad_token_id=self.pad_token_id,
            decoder_start_token_id=self.decoder_start_token_id,
        )
        inputs_dict = prepare_marian_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
--- a/tests/test_modeling_mbart.py
+++ b/tests/test_modeling_mbart.py
@@ -19,7 +19,7 @@ import copy
 import tempfile
 import unittest
-from transformers import is_torch_available
+from transformers import MBartConfig, is_torch_available
 from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -34,7 +34,6 @@ if is_torch_available():
    from transformers import (
        AutoTokenizer,
        BatchEncoding,
        MBartConfig,
        MBartForCausalLM,
        MBartForConditionalGeneration,
        MBartForQuestionAnswering,
@@ -75,7 +74,6 @@ def prepare_mbart_inputs_dict(
    }
@require_torch
 class MBartModelTester:
    def __init__(
        self,
@@ -124,7 +122,12 @@ class MBartModelTester:
        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        config = MBartConfig(
+        config = self.get_config()
        inputs_dict = prepare_mbart_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def get_config(self):
        return MBartConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
@@ -140,8 +143,6 @@ class MBartModelTester:
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
        )
        inputs_dict = prepare_mbart_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
--- a/tests/test_modeling_megatron_bert.py
+++ b/tests/test_modeling_megatron_bert.py
@@ -19,7 +19,7 @@ import math
 import os
 import unittest
-from transformers import is_torch_available
+from transformers import MegatronBertConfig, is_torch_available
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -32,7 +32,6 @@ if is_torch_available():
    from transformers import (
        MODEL_FOR_PRETRAINING_MAPPING,
        MegatronBertConfig,
        MegatronBertForCausalLM,
        MegatronBertForMaskedLM,
        MegatronBertForMultipleChoice,
@@ -115,7 +114,12 @@ class MegatronBertModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = MegatronBertConfig(
+        config = self.get_config()
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        return MegatronBertConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -131,8 +135,6 @@ class MegatronBertModelTester:
            initializer_range=self.initializer_range,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def create_and_check_megatron_bert_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
--- a/tests/test_modeling_mobilebert.py
+++ b/tests/test_modeling_mobilebert.py
@@ -16,7 +16,7 @@
 import unittest
-from transformers import is_torch_available
+from transformers import MobileBertConfig, is_torch_available
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -29,7 +29,6 @@ if is_torch_available():
    from transformers import (
        MODEL_FOR_PRETRAINING_MAPPING,
        MobileBertConfig,
        MobileBertForMaskedLM,
        MobileBertForMultipleChoice,
        MobileBertForNextSentencePrediction,
@@ -111,7 +110,12 @@ class MobileBertModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = MobileBertConfig(
+        config = self.get_config()
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        return MobileBertConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -127,8 +131,6 @@ class MobileBertModelTester:
            initializer_range=self.initializer_range,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def create_and_check_mobilebert_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
--- a/tests/test_modeling_mpnet.py
+++ b/tests/test_modeling_mpnet.py
@@ -16,7 +16,7 @@
 import unittest
-from transformers import is_torch_available
+from transformers import MPNetConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -27,7 +27,6 @@ if is_torch_available():
    import torch
    from transformers import (
        MPNetConfig,
        MPNetForMaskedLM,
        MPNetForMultipleChoice,
        MPNetForQuestionAnswering,
@@ -104,7 +103,11 @@ class MPNetModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = MPNetConfig(
+        config = self.get_config()
        return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        return MPNetConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -116,7 +119,6 @@ class MPNetModelTester:
            max_position_embeddings=self.max_position_embeddings,
            initializer_range=self.initializer_range,
        )
        return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
    def create_and_check_mpnet_model(
        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
--- a/tests/test_modeling_pegasus.py
+++ b/tests/test_modeling_pegasus.py
@@ -17,7 +17,7 @@
 import tempfile
 import unittest
-from transformers import is_torch_available
+from transformers import PegasusConfig, is_torch_available
 from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -30,7 +30,7 @@ from .test_modeling_mbart import AbstractSeq2SeqIntegrationTest
 if is_torch_available():
    import torch
-    from transformers import AutoModelForSeq2SeqLM, PegasusConfig, PegasusForConditionalGeneration, PegasusModel
+    from transformers import AutoModelForSeq2SeqLM, PegasusForConditionalGeneration, PegasusModel
    from transformers.models.pegasus.modeling_pegasus import PegasusDecoder, PegasusEncoder, PegasusForCausalLM
@@ -65,7 +65,6 @@ def prepare_pegasus_inputs_dict(
    }
@require_torch
 class PegasusModelTester:
    def __init__(
        self,
@@ -114,7 +113,12 @@ class PegasusModelTester:
        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        config = PegasusConfig(
+        config = self.get_config()
        inputs_dict = prepare_pegasus_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def get_config(self):
        return PegasusConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
@@ -130,8 +134,6 @@ class PegasusModelTester:
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
        )
        inputs_dict = prepare_pegasus_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
--- a/tests/test_modeling_prophetnet.py
+++ b/tests/test_modeling_prophetnet.py
@@ -13,12 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import copy
 import tempfile
 import unittest
-from transformers import is_torch_available
+from transformers import ProphetNetConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -30,7 +29,6 @@ if is_torch_available():
    import torch
    from transformers import (
        ProphetNetConfig,
        ProphetNetDecoder,
        ProphetNetEncoder,
        ProphetNetForCausalLM,
@@ -124,7 +122,19 @@ class ProphetNetModelTester:
        if self.use_labels:
            lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-        config = ProphetNetConfig(
+        config = self.get_config()
        return (
            config,
            input_ids,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        )
    def get_config(self):
        return ProphetNetConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_encoder_layers=self.num_encoder_layers,
@@ -145,15 +155,6 @@ class ProphetNetModelTester:
            is_encoder_decoder=self.is_encoder_decoder,
        )
        return (
            config,
            input_ids,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        )
    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
--- a/tests/test_modeling_reformer.py
+++ b/tests/test_modeling_reformer.py
@@ -15,7 +15,7 @@
 import unittest
-from transformers import is_torch_available
+from transformers import ReformerConfig, is_torch_available
 from transformers.testing_utils import (
    require_sentencepiece,
    require_tokenizers,
@@ -36,7 +36,6 @@ if is_torch_available():
    from transformers import (
        REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
        ReformerConfig,
        ReformerForMaskedLM,
        ReformerForQuestionAnswering,
        ReformerForSequenceClassification,
@@ -51,44 +50,44 @@ class ReformerModelTester:
    def __init__(
        self,
        parent,
-        batch_size=None,
+        batch_size=13,
-        seq_length=None,
+        seq_length=32,
-        is_training=None,
+        is_training=True,
-        is_decoder=None,
+        is_decoder=True,
-        use_input_mask=None,
+        use_input_mask=True,
-        use_labels=None,
+        use_labels=True,
-        vocab_size=None,
+        vocab_size=32,
-        attention_head_size=None,
+        attention_head_size=16,
-        hidden_size=None,
+        hidden_size=32,
-        num_attention_heads=None,
+        num_attention_heads=2,
-        local_attn_chunk_length=None,
+        local_attn_chunk_length=4,
-        local_num_chunks_before=None,
+        local_num_chunks_before=1,
-        local_num_chunks_after=None,
+        local_num_chunks_after=0,
        num_buckets=None,
        num_hashes=1,
        lsh_attn_chunk_length=None,
        lsh_num_chunks_before=None,
        lsh_num_chunks_after=None,
-        chunk_size_lm_head=None,
+        chunk_size_lm_head=0,
-        chunk_size_feed_forward=None,
+        chunk_size_feed_forward=0,
-        feed_forward_size=None,
+        feed_forward_size=32,
-        hidden_act=None,
+        hidden_act="gelu",
-        hidden_dropout_prob=None,
+        hidden_dropout_prob=0.1,
-        local_attention_probs_dropout_prob=None,
+        local_attention_probs_dropout_prob=0.1,
        lsh_attention_probs_dropout_prob=None,
-        max_position_embeddings=None,
+        max_position_embeddings=512,
-        initializer_range=None,
+        initializer_range=0.02,
-        axial_norm_std=None,
+        axial_norm_std=1.0,
-        layer_norm_eps=None,
+        layer_norm_eps=1e-12,
-        axial_pos_embds=None,
+        axial_pos_embds=True,
-        axial_pos_shape=None,
+        axial_pos_shape=[4, 8],
-        axial_pos_embds_dim=None,
+        axial_pos_embds_dim=[16, 16],
-        attn_layers=None,
+        attn_layers=["local", "local", "local", "local"],
-        pad_token_id=None,
+        pad_token_id=0,
-        eos_token_id=None,
+        eos_token_id=2,
        scope=None,
-        hash_seed=None,
+        hash_seed=0,
-        num_labels=None,
+        num_labels=2,
    ):
        self.parent = parent
        self.batch_size = batch_size
@@ -101,7 +100,7 @@ class ReformerModelTester:
        self.attention_head_size = attention_head_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
-        self.num_hidden_layers = len(attn_layers)
+        self.num_hidden_layers = len(attn_layers) if attn_layers is not None else 0
        self.local_attn_chunk_length = local_attn_chunk_length
        self.local_num_chunks_after = local_num_chunks_after
        self.local_num_chunks_before = local_num_chunks_before
@@ -149,7 +148,17 @@ class ReformerModelTester:
        if self.use_labels:
            choice_labels = ids_tensor([self.batch_size], 2)
-        config = ReformerConfig(
+        config = self.get_config()
        return (
            config,
            input_ids,
            input_mask,
            choice_labels,
        )
    def get_config(self):
        return ReformerConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -177,13 +186,6 @@ class ReformerModelTester:
            hash_seed=self.hash_seed,
        )
        return (
            config,
            input_ids,
            input_mask,
            choice_labels,
        )
    def create_and_check_reformer_model(self, config, input_ids, input_mask, choice_labels):
        model = ReformerModel(config=config)
        model.to(torch_device)
@@ -593,45 +595,8 @@ class ReformerLocalAttnModelTest(ReformerTesterMixin, GenerationTesterMixin, Mod
    test_torchscript = False
    test_sequence_classification_problem_types = True
    def prepare_kwargs(self):
        return {
            "batch_size": 13,
            "seq_length": 32,
            "is_training": True,
            "is_decoder": True,
            "use_input_mask": True,
            "use_labels": True,
            "vocab_size": 32,
            "attention_head_size": 16,
            "hidden_size": 32,
            "num_attention_heads": 2,
            "local_attn_chunk_length": 4,
            "local_num_chunks_before": 1,
            "local_num_chunks_after": 0,
            "chunk_size_lm_head": 0,
            "chunk_size_feed_forward": 0,
            "feed_forward_size": 32,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.1,
            "local_attention_probs_dropout_prob": 0.1,
            "max_position_embeddings": 512,
            "initializer_range": 0.02,
            "axial_norm_std": 1.0,
            "layer_norm_eps": 1e-12,
            "axial_pos_embds": True,
            "axial_pos_shape": [4, 8],
            "axial_pos_embds_dim": [16, 16],
            "attn_layers": ["local", "local", "local", "local"],
            "pad_token_id": 0,
            "eos_token_id": 2,
            "scope": None,
            "hash_seed": 0,
            "num_labels": 2,
        }
    def setUp(self):
-        tester_kwargs = self.prepare_kwargs()
+        self.model_tester = ReformerModelTester(self)
        self.model_tester = ReformerModelTester(self, **tester_kwargs)
        self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=37)
    @slow
@@ -716,49 +681,46 @@ class ReformerLSHAttnModelTest(ReformerTesterMixin, ModelTesterMixin, Generation
    test_headmasking = False
    test_torchscript = False
    def prepare_kwargs(self):
        return {
            "batch_size": 13,
            "seq_length": 13,
            "use_input_mask": True,
            "use_labels": True,
            "is_training": False,
            "is_decoder": True,
            "vocab_size": 32,
            "attention_head_size": 16,
            "hidden_size": 64,
            "num_attention_heads": 2,
            "num_buckets": 2,
            "num_hashes": 4,
            "lsh_attn_chunk_length": 4,
            "lsh_num_chunks_before": 1,
            "lsh_num_chunks_after": 0,
            "chunk_size_lm_head": 5,
            "chunk_size_feed_forward": 6,
            "feed_forward_size": 32,
            "hidden_act": "relu",
            "hidden_dropout_prob": 0.1,
            "lsh_attention_probs_dropout_prob": 0.1,
            "max_position_embeddings": 512,
            "initializer_range": 0.02,
            "axial_norm_std": 1.0,
            "layer_norm_eps": 1e-12,
            "axial_pos_embds": True,
            "axial_pos_shape": [4, 8],
            "axial_pos_embds_dim": [16, 48],
            #            sanotheu
            #            "attn_layers": ["lsh", "lsh", "lsh", "lsh"],
            "attn_layers": ["lsh"],
            "pad_token_id": 0,
            "eos_token_id": 2,
            "scope": None,
            "hash_seed": 0,
            "num_labels": 2,
        }
    def setUp(self):
-        tester_kwargs = self.prepare_kwargs()
+        self.model_tester = ReformerModelTester(
-        self.model_tester = ReformerModelTester(self, **tester_kwargs)
+            self,
            batch_size=13,
            seq_length=13,
            use_input_mask=True,
            use_labels=True,
            is_training=False,
            is_decoder=True,
            vocab_size=32,
            attention_head_size=16,
            hidden_size=64,
            num_attention_heads=2,
            num_buckets=2,
            num_hashes=4,
            lsh_attn_chunk_length=4,
            lsh_num_chunks_before=1,
            lsh_num_chunks_after=0,
            chunk_size_lm_head=5,
            chunk_size_feed_forward=6,
            feed_forward_size=32,
            hidden_act="relu",
            hidden_dropout_prob=0.1,
            lsh_attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            initializer_range=0.02,
            axial_norm_std=1.0,
            layer_norm_eps=1e-12,
            axial_pos_embds=True,
            axial_pos_shape=[4, 8],
            axial_pos_embds_dim=[16, 48],
            # sanotheu
            # attn_layers=[lsh,lsh,lsh,lsh],
            attn_layers=["lsh"],
            pad_token_id=0,
            eos_token_id=2,
            scope=None,
            hash_seed=0,
            num_labels=2,
        )
        self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=37)
    def _check_attentions_for_generate(
--- a/tests/test_modeling_roberta.py
+++ b/tests/test_modeling_roberta.py
@@ -17,7 +17,7 @@
 import unittest
 from copy import deepcopy
-from transformers import is_torch_available
+from transformers import RobertaConfig, is_torch_available
 from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -29,7 +29,6 @@ if is_torch_available():
    import torch
    from transformers import (
        RobertaConfig,
        RobertaForCausalLM,
        RobertaForMaskedLM,
        RobertaForMultipleChoice,
@@ -94,7 +93,12 @@ class RobertaModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = RobertaConfig(
+        config = self.get_config()
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        return RobertaConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -108,8 +112,6 @@ class RobertaModelTester:
            initializer_range=self.initializer_range,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
--- a/tests/test_modeling_roformer.py
+++ b/tests/test_modeling_roformer.py
@@ -18,7 +18,7 @@
 import unittest
 from tests.test_modeling_common import floats_tensor
-from transformers import is_torch_available
+from transformers import RoFormerConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -29,7 +29,6 @@ if is_torch_available():
    import torch
    from transformers import (
        RoFormerConfig,
        RoFormerForCausalLM,
        RoFormerForMaskedLM,
        RoFormerForMultipleChoice,
@@ -113,7 +112,12 @@ class RoFormerModelTester:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = RoFormerConfig(
+        config = self.get_config()
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
        return RoFormerConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -128,8 +132,6 @@ class RoFormerModelTester:
            initializer_range=self.initializer_range,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def prepare_config_and_inputs_for_decoder(self):
        (
            config,
--- a/tests/test_modeling_speech_to_text.py
+++ b/tests/test_modeling_speech_to_text.py
@@ -14,13 +14,13 @@
 # limitations under the License.
 """ Testing suite for the PyTorch Speech2Text model. """
 import copy
 import inspect
 import os
 import tempfile
 import unittest
 from transformers import Speech2TextConfig
 from transformers.file_utils import cached_property
 from transformers.testing_utils import (
    is_torch_available,
@@ -40,12 +40,7 @@ from .test_modeling_common import ModelTesterMixin, _config_zero_init, floats_te
 if is_torch_available():
    import torch
-    from transformers import (
+    from transformers import Speech2TextForConditionalGeneration, Speech2TextModel, Speech2TextProcessor
        Speech2TextConfig,
        Speech2TextForConditionalGeneration,
        Speech2TextModel,
        Speech2TextProcessor,
    )
    from transformers.models.speech_to_text.modeling_speech_to_text import Speech2TextDecoder, Speech2TextEncoder
@@ -142,7 +137,17 @@ class Speech2TextModelTester:
        attention_mask = torch.ones([self.batch_size, self.seq_length], dtype=torch.long, device=torch_device)
        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(2)
-        config = Speech2TextConfig(
+        config = self.get_config()
        inputs_dict = prepare_speech_to_text_inputs_dict(
            config,
            input_features=input_features,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
        )
        return config, inputs_dict
    def get_config(self):
        return Speech2TextConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
@@ -165,13 +170,6 @@ class Speech2TextModelTester:
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
        )
        inputs_dict = prepare_speech_to_text_inputs_dict(
            config,
            input_features=input_features,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
        )
        return config, inputs_dict
    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
--- a/tests/test_modeling_squeezebert.py
+++ b/tests/test_modeling_squeezebert.py
@@ -16,7 +16,7 @@
 import unittest
-from transformers import is_torch_available
+from transformers import SqueezeBertConfig, is_torch_available
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -28,7 +28,6 @@ if is_torch_available():
    from transformers import (
        SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
        SqueezeBertConfig,
        SqueezeBertForMaskedLM,
        SqueezeBertForMultipleChoice,
        SqueezeBertForQuestionAnswering,
@@ -37,179 +36,181 @@ if is_torch_available():
        SqueezeBertModel,
    )
    class SqueezeBertModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=False,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=64,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
            q_groups=2,
            k_groups=2,
            v_groups=2,
            post_attention_groups=2,
            intermediate_groups=4,
            output_groups=1,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
            self.q_groups = q_groups
            self.k_groups = k_groups
            self.v_groups = v_groups
            self.post_attention_groups = post_attention_groups
            self.intermediate_groups = intermediate_groups
            self.output_groups = output_groups
-        def prepare_config_and_inputs(self):
+class SqueezeBertModelTester(object):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        use_token_type_ids=False,
        use_labels=True,
        vocab_size=99,
        hidden_size=32,
        num_hidden_layers=5,
        num_attention_heads=4,
        intermediate_size=64,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=16,
        type_sequence_label_size=2,
        initializer_range=0.02,
        num_labels=3,
        num_choices=4,
        scope=None,
        q_groups=2,
        k_groups=2,
        v_groups=2,
        post_attention_groups=2,
        intermediate_groups=4,
        output_groups=1,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_input_mask = use_input_mask
        self.use_token_type_ids = use_token_type_ids
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.type_sequence_label_size = type_sequence_label_size
        self.initializer_range = initializer_range
        self.num_labels = num_labels
        self.num_choices = num_choices
        self.scope = scope
        self.q_groups = q_groups
        self.k_groups = k_groups
        self.v_groups = v_groups
        self.post_attention_groups = post_attention_groups
        self.intermediate_groups = intermediate_groups
        self.output_groups = output_groups
-            input_mask = None
+    def prepare_config_and_inputs(self):
-            if self.use_input_mask:
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
                input_mask = random_attention_mask([self.batch_size, self.seq_length])
-            sequence_labels = None
+        input_mask = None
-            token_labels = None
+        if self.use_input_mask:
-            choice_labels = None
+            input_mask = random_attention_mask([self.batch_size, self.seq_length])
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-            config = SqueezeBertConfig(
+        sequence_labels = None
-                embedding_size=self.hidden_size,
+        token_labels = None
-                vocab_size=self.vocab_size,
+        choice_labels = None
-                hidden_size=self.hidden_size,
+        if self.use_labels:
-                num_hidden_layers=self.num_hidden_layers,
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                num_attention_heads=self.num_attention_heads,
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                intermediate_size=self.intermediate_size,
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
                hidden_act=self.hidden_act,
                attention_probs_dropout_prob=self.hidden_dropout_prob,
                attention_dropout=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                initializer_range=self.initializer_range,
                q_groups=self.q_groups,
                k_groups=self.k_groups,
                v_groups=self.v_groups,
                post_attention_groups=self.post_attention_groups,
                intermediate_groups=self.intermediate_groups,
                output_groups=self.output_groups,
            )
-            return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        config = self.get_config()
-        def create_and_check_squeezebert_model(
+        return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = SqueezeBertModel(config=config)
            model.to(torch_device)
            model.eval()
            result = model(input_ids, input_mask)
            result = model(input_ids)
            self.parent.assertEqual(
                result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)
            )
-        def create_and_check_squeezebert_for_masked_lm(
+    def get_config(self):
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        return SqueezeBertConfig(
-        ):
+            embedding_size=self.hidden_size,
-            model = SqueezeBertForMaskedLM(config=config)
+            vocab_size=self.vocab_size,
-            model.to(torch_device)
+            hidden_size=self.hidden_size,
-            model.eval()
+            num_hidden_layers=self.num_hidden_layers,
-            result = model(input_ids, attention_mask=input_mask, labels=token_labels)
+            num_attention_heads=self.num_attention_heads,
-            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            attention_probs_dropout_prob=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            initializer_range=self.initializer_range,
            q_groups=self.q_groups,
            k_groups=self.k_groups,
            v_groups=self.v_groups,
            post_attention_groups=self.post_attention_groups,
            intermediate_groups=self.intermediate_groups,
            output_groups=self.output_groups,
        )
-        def create_and_check_squeezebert_for_question_answering(
+    def create_and_check_squeezebert_model(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
+    ):
-            model = SqueezeBertForQuestionAnswering(config=config)
+        model = SqueezeBertModel(config=config)
-            model.to(torch_device)
+        model.to(torch_device)
-            model.eval()
+        model.eval()
-            result = model(
+        result = model(input_ids, input_mask)
-                input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels
+        result = model(input_ids)
-            )
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
            self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
            self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-        def create_and_check_squeezebert_for_sequence_classification(
+    def create_and_check_squeezebert_for_masked_lm(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
+    ):
-            config.num_labels = self.num_labels
+        model = SqueezeBertForMaskedLM(config=config)
-            model = SqueezeBertForSequenceClassification(config)
+        model.to(torch_device)
-            model.to(torch_device)
+        model.eval()
-            model.eval()
+        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
-            result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-        def create_and_check_squeezebert_for_token_classification(
+    def create_and_check_squeezebert_for_question_answering(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
+    ):
-            config.num_labels = self.num_labels
+        model = SqueezeBertForQuestionAnswering(config=config)
-            model = SqueezeBertForTokenClassification(config=config)
+        model.to(torch_device)
-            model.to(torch_device)
+        model.eval()
-            model.eval()
+        result = model(
            input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels
        )
        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-            result = model(input_ids, attention_mask=input_mask, labels=token_labels)
+    def create_and_check_squeezebert_for_sequence_classification(
-            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
+        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_labels = self.num_labels
        model = SqueezeBertForSequenceClassification(config)
        model.to(torch_device)
        model.eval()
        result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-        def create_and_check_squeezebert_for_multiple_choice(
+    def create_and_check_squeezebert_for_token_classification(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
+    ):
-            config.num_choices = self.num_choices
+        config.num_labels = self.num_labels
-            model = SqueezeBertForMultipleChoice(config=config)
+        model = SqueezeBertForTokenClassification(config=config)
-            model.to(torch_device)
+        model.to(torch_device)
-            model.eval()
+        model.eval()
            multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
            result = model(
                multiple_choice_inputs_ids,
                attention_mask=multiple_choice_input_mask,
                labels=choice_labels,
            )
            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-        def prepare_config_and_inputs_for_common(self):
+        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
-            config_and_inputs = self.prepare_config_and_inputs()
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-            (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
+
-            inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
+    def create_and_check_squeezebert_for_multiple_choice(
-            return config, inputs_dict
+        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_choices = self.num_choices
        model = SqueezeBertForMultipleChoice(config=config)
        model.to(torch_device)
        model.eval()
        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
        result = model(
            multiple_choice_inputs_ids,
            attention_mask=multiple_choice_input_mask,
            labels=choice_labels,
        )
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
        return config, inputs_dict
@require_torch
--- a/tests/test_modeling_t5.py
+++ b/tests/test_modeling_t5.py
@@ -18,7 +18,7 @@ import copy
 import tempfile
 import unittest
-from transformers import is_torch_available
+from transformers import T5Config, is_torch_available
 from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -30,7 +30,7 @@ from .test_modeling_common import ModelTesterMixin, ids_tensor
 if is_torch_available():
    import torch
-    from transformers import ByT5Tokenizer, T5Config, T5EncoderModel, T5ForConditionalGeneration, T5Model, T5Tokenizer
+    from transformers import ByT5Tokenizer, T5EncoderModel, T5ForConditionalGeneration, T5Model, T5Tokenizer
    from transformers.models.t5.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_LIST
@@ -100,7 +100,19 @@ class T5ModelTester:
        if self.use_labels:
            lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-        config = T5Config(
+        config = self.get_config()
        return (
            config,
            input_ids,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        )
    def get_config(self):
        return T5Config(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            d_ff=self.d_ff,
@@ -117,15 +129,6 @@ class T5ModelTester:
            decoder_start_token_id=self.decoder_start_token_id,
        )
        return (
            config,
            input_ids,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        )
    def check_prepare_lm_labels_via_shift_left(
        self,
        config,
--- a/tests/test_modeling_tapas.py
+++ b/tests/test_modeling_tapas.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import copy
 import unittest
@@ -29,6 +28,7 @@ from transformers import (
    MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
    MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
    MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
    TapasConfig,
    is_torch_available,
 )
 from transformers.file_utils import cached_property
@@ -43,7 +43,6 @@ if is_torch_available():
    import torch
    from transformers import (
        TapasConfig,
        TapasForMaskedLM,
        TapasForQuestionAnswering,
        TapasForSequenceClassification,
@@ -183,7 +182,24 @@ class TapasModelTester:
            float_answer = floats_tensor([self.batch_size]).to(torch_device)
            aggregation_labels = ids_tensor([self.batch_size], self.num_aggregation_labels).to(torch_device)
-        config = TapasConfig(
+        config = self.get_config()
        return (
            config,
            input_ids,
            input_mask,
            token_type_ids,
            sequence_labels,
            token_labels,
            labels,
            numeric_values,
            numeric_values_scale,
            float_answer,
            aggregation_labels,
        )
    def get_config(self):
        return TapasConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -220,20 +236,6 @@ class TapasModelTester:
            disable_per_token_loss=self.disable_per_token_loss,
        )
        return (
            config,
            input_ids,
            input_mask,
            token_type_ids,
            sequence_labels,
            token_labels,
            labels,
            numeric_values,
            numeric_values_scale,
            float_answer,
            aggregation_labels,
        )
    def create_and_check_model(
        self,
        config,
--- a/tests/test_modeling_transfo_xl.py
+++ b/tests/test_modeling_transfo_xl.py
@@ -17,7 +17,7 @@ import copy
 import random
 import unittest
-from transformers import is_torch_available
+from transformers import TransfoXLConfig, is_torch_available
 from transformers.testing_utils import require_torch, require_torch_multi_gpu, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -29,7 +29,7 @@ if is_torch_available():
    import torch
    from torch import nn
-    from transformers import TransfoXLConfig, TransfoXLForSequenceClassification, TransfoXLLMHeadModel, TransfoXLModel
+    from transformers import TransfoXLForSequenceClassification, TransfoXLLMHeadModel, TransfoXLModel
    from transformers.models.transfo_xl.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST
@@ -69,7 +69,12 @@ class TransfoXLModelTester:
        if self.use_labels:
            lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        config = TransfoXLConfig(
+        config = self.get_config()
        return (config, input_ids_1, input_ids_2, lm_labels)
    def get_config(self):
        return TransfoXLConfig(
            vocab_size=self.vocab_size,
            mem_len=self.mem_len,
            clamp_len=self.clamp_len,
@@ -85,8 +90,6 @@ class TransfoXLModelTester:
            pad_token_id=self.pad_token_id,
        )
        return (config, input_ids_1, input_ids_2, lm_labels)
    def set_seed(self):
        random.seed(self.seed)
        torch.manual_seed(self.seed)
--- a/tests/test_modeling_visual_bert.py
+++ b/tests/test_modeling_visual_bert.py
@@ -14,12 +14,11 @@
 # limitations under the License.
 """ Testing suite for the PyTorch VisualBERT model. """
 import copy
 import unittest
 from tests.test_modeling_common import floats_tensor
-from transformers import is_torch_available
+from transformers import VisualBertConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -30,7 +29,6 @@ if is_torch_available():
    import torch
    from transformers import (
        VisualBertConfig,
        VisualBertForMultipleChoice,
        VisualBertForPreTraining,
        VisualBertForQuestionAnswering,
@@ -98,7 +96,7 @@ class VisualBertModelTester:
        self.num_choices = num_choices
        self.scope = scope
-    def prepare_config(self):
+    def get_config(self):
        return VisualBertConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
@@ -138,7 +136,7 @@ class VisualBertModelTester:
        if self.use_visual_token_type_ids:
            visual_token_type_ids = ids_tensor([self.batch_size, self.visual_seq_length], self.type_vocab_size)
-        config = self.prepare_config()
+        config = self.get_config()
        return config, {
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
@@ -198,7 +196,7 @@ class VisualBertModelTester:
        if self.use_labels:
            labels = ids_tensor([self.batch_size], self.num_choices)
-        config = self.prepare_config()
+        config = self.get_config()
        return config, {
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
--- a/tests/test_modeling_vit.py
+++ b/tests/test_modeling_vit.py
@@ -18,6 +18,7 @@
 import inspect
 import unittest
 from transformers import ViTConfig
 from transformers.file_utils import cached_property, is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
@@ -29,7 +30,7 @@ if is_torch_available():
    import torch
    from torch import nn
-    from transformers import ViTConfig, ViTForImageClassification, ViTModel
+    from transformers import ViTForImageClassification, ViTModel
    from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST, to_2tuple
@@ -86,7 +87,12 @@ class ViTModelTester:
        if self.use_labels:
            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-        config = ViTConfig(
+        config = self.get_config()
        return config, pixel_values, labels
    def get_config(self):
        return ViTConfig(
            image_size=self.image_size,
            patch_size=self.patch_size,
            num_channels=self.num_channels,
@@ -101,8 +107,6 @@ class ViTModelTester:
            initializer_range=self.initializer_range,
        )
        return config, pixel_values, labels
    def create_and_check_model(self, config, pixel_values, labels):
        model = ViTModel(config=config)
        model.to(torch_device)
--- a/tests/test_modeling_wav2vec2.py
+++ b/tests/test_modeling_wav2vec2.py
@@ -21,7 +21,7 @@ import unittest
 import pytest
 from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask
-from transformers import is_torch_available
+from transformers import Wav2Vec2Config, is_torch_available
 from transformers.testing_utils import require_datasets, require_soundfile, require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -32,7 +32,6 @@ if is_torch_available():
    import torch
    from transformers import (
        Wav2Vec2Config,
        Wav2Vec2FeatureExtractor,
        Wav2Vec2ForCTC,
        Wav2Vec2ForMaskedLM,
@@ -106,7 +105,12 @@ class Wav2Vec2ModelTester:
        input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size)
        attention_mask = random_attention_mask([self.batch_size, self.seq_length])
-        config = Wav2Vec2Config(
+        config = self.get_config()
        return config, input_values, attention_mask
    def get_config(self):
        return Wav2Vec2Config(
            hidden_size=self.hidden_size,
            feat_extract_norm=self.feat_extract_norm,
            feat_extract_dropout=self.feat_extract_dropout,
@@ -127,8 +131,6 @@ class Wav2Vec2ModelTester:
            vocab_size=self.vocab_size,
        )
        return config, input_values, attention_mask
    def create_and_check_model(self, config, input_values, attention_mask):
        model = Wav2Vec2Model(config=config)
        model.to(torch_device)
--- a/tests/test_modeling_xlm.py
+++ b/tests/test_modeling_xlm.py
@@ -13,10 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import unittest
-from transformers import is_torch_available
+from transformers import XLMConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -28,7 +27,6 @@ if is_torch_available():
    import torch
    from transformers import (
        XLMConfig,
        XLMForMultipleChoice,
        XLMForQuestionAnswering,
        XLMForQuestionAnsweringSimple,
@@ -97,7 +95,22 @@ class XLMModelTester:
            is_impossible_labels = ids_tensor([self.batch_size], 2).float()
            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-        config = XLMConfig(
+        config = self.get_config()
        return (
            config,
            input_ids,
            token_type_ids,
            input_lengths,
            sequence_labels,
            token_labels,
            is_impossible_labels,
            choice_labels,
            input_mask,
        )
    def get_config(self):
        return XLMConfig(
            vocab_size=self.vocab_size,
            n_special=self.n_special,
            emb_dim=self.hidden_size,
@@ -118,18 +131,6 @@ class XLMModelTester:
            bos_token_id=self.bos_token_id,
        )
        return (
            config,
            input_ids,
            token_type_ids,
            input_lengths,
            sequence_labels,
            token_labels,
            is_impossible_labels,
            choice_labels,
            input_mask,
        )
    def create_and_check_xlm_model(
        self,
        config,
--- a/tests/test_modeling_xlnet.py
+++ b/tests/test_modeling_xlnet.py
@@ -13,11 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import random
 import unittest
-from transformers import is_torch_available
+from transformers import XLNetConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -29,7 +28,6 @@ if is_torch_available():
    import torch
    from transformers import (
        XLNetConfig,
        XLNetForMultipleChoice,
        XLNetForQuestionAnswering,
        XLNetForQuestionAnsweringSimple,
@@ -131,7 +129,25 @@ class XLNetModelTester:
            is_impossible_labels = ids_tensor([self.batch_size], 2).float()
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-        config = XLNetConfig(
+        config = self.get_config()
        return (
            config,
            input_ids_1,
            input_ids_2,
            input_ids_q,
            perm_mask,
            input_mask,
            target_mapping,
            segment_ids,
            lm_labels,
            sequence_labels,
            is_impossible_labels,
            token_labels,
        )
    def get_config(self):
        return XLNetConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            n_head=self.num_attention_heads,
@@ -150,21 +166,6 @@ class XLNetModelTester:
            eos_token_id=self.eos_token_id,
        )
        return (
            config,
            input_ids_1,
            input_ids_2,
            input_ids_q,
            perm_mask,
            input_mask,
            target_mapping,
            segment_ids,
            lm_labels,
            sequence_labels,
            is_impossible_labels,
            token_labels,
        )
    def set_seed(self):
        random.seed(self.seed)
        torch.manual_seed(self.seed)