diff --git a/docs/source/en/model_doc/bort.md b/docs/source/en/model_doc/bort.md
index 23f004cc9b..dccf2b560b 100644
--- a/docs/source/en/model_doc/bort.md
+++ b/docs/source/en/model_doc/bort.md
@@ -16,6 +16,15 @@ rendered properly in your Markdown viewer.
 
 # BORT
 
+<Tip warning={true}>
+
+This model is in maintenance mode only, so we won't accept any new PRs changing its code.
+
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
+You can do so by running the following command: `pip install -U transformers==4.30.0`.
+
+</Tip>
+
 ## Overview
 
 The BORT model was proposed in [Optimal Subarchitecture Extraction for BERT](https://arxiv.org/abs/2010.10499) by
diff --git a/docs/source/en/model_doc/mctct.md b/docs/source/en/model_doc/mctct.md
index 1fbe78dca6..72d4bedfac 100644
--- a/docs/source/en/model_doc/mctct.md
+++ b/docs/source/en/model_doc/mctct.md
@@ -16,6 +16,15 @@ rendered properly in your Markdown viewer.
 
 # M-CTC-T
 
+<Tip warning={true}>
+
+This model is in maintenance mode only, so we won't accept any new PRs changing its code.
+
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
+You can do so by running the following command: `pip install -U transformers==4.30.0`.
+
+</Tip>
+
 ## Overview
 
 The M-CTC-T model was proposed in [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert. The model is a 1B-param transformer encoder, with a CTC head over 8065 character labels and a language identification head over 60 language ID labels. It is trained on Common Voice (version 6.1, December 2020 release) and VoxPopuli. After training on Common Voice and VoxPopuli, the model is trained on Common Voice only. The labels are unnormalized character-level transcripts (punctuation and capitalization are not removed). The model takes as input Mel filterbank features from a 16Khz audio signal.
diff --git a/docs/source/en/model_doc/retribert.md b/docs/source/en/model_doc/retribert.md
index d982cf4a0b..ab29ac966f 100644
--- a/docs/source/en/model_doc/retribert.md
+++ b/docs/source/en/model_doc/retribert.md
@@ -16,6 +16,15 @@ rendered properly in your Markdown viewer.
 
 # RetriBERT
 
+<Tip warning={true}>
+
+This model is in maintenance mode only, so we won't accept any new PRs changing its code.
+
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
+You can do so by running the following command: `pip install -U transformers==4.30.0`.
+
+</Tip>
+
 ## Overview
 
 The RetriBERT model was proposed in the blog post [Explain Anything Like I'm Five: A Model for Open Domain Long Form
diff --git a/docs/source/en/model_doc/tapex.md b/docs/source/en/model_doc/tapex.md
index 8cebceeb73..52234b5c59 100644
--- a/docs/source/en/model_doc/tapex.md
+++ b/docs/source/en/model_doc/tapex.md
@@ -16,6 +16,15 @@ rendered properly in your Markdown viewer.
 
 # TAPEX
 
+<Tip warning={true}>
+
+This model is in maintenance mode only, so we won't accept any new PRs changing its code.
+
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
+You can do so by running the following command: `pip install -U transformers==4.30.0`.
+
+</Tip>
+
 ## Overview
 
 The TAPEX model was proposed in [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu,
diff --git a/docs/source/en/model_doc/trajectory_transformer.md b/docs/source/en/model_doc/trajectory_transformer.md
index 25b24e3db6..548642f7bb 100644
--- a/docs/source/en/model_doc/trajectory_transformer.md
+++ b/docs/source/en/model_doc/trajectory_transformer.md
@@ -16,6 +16,15 @@ rendered properly in your Markdown viewer.
 
 # Trajectory Transformer
 
+<Tip warning={true}>
+
+This model is in maintenance mode only, so we won't accept any new PRs changing its code.
+
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
+You can do so by running the following command: `pip install -U transformers==4.30.0`.
+
+</Tip>
+
 ## Overview
 
 The Trajectory Transformer model was proposed in [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039)  by Michael Janner, Qiyang Li, Sergey Levine.
diff --git a/docs/source/en/model_doc/van.md b/docs/source/en/model_doc/van.md
index c6d7653363..b9539602d3 100644
--- a/docs/source/en/model_doc/van.md
+++ b/docs/source/en/model_doc/van.md
@@ -16,6 +16,15 @@ rendered properly in your Markdown viewer.
 
 # VAN
 
+<Tip warning={true}>
+
+This model is in maintenance mode only, so we won't accept any new PRs changing its code.
+
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
+You can do so by running the following command: `pip install -U transformers==4.30.0`.
+
+</Tip>
+
 ## Overview
 
 The VAN model was proposed in [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 19789d9924..872f2ddaad 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -202,7 +202,6 @@ _import_structure = {
         "Blip2VisionConfig",
     ],
     "models.bloom": ["BLOOM_PRETRAINED_CONFIG_ARCHIVE_MAP", "BloomConfig"],
-    "models.bort": [],
     "models.bridgetower": [
         "BRIDGETOWER_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "BridgeTowerConfig",
@@ -263,6 +262,26 @@ _import_structure = {
     "models.decision_transformer": ["DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "DecisionTransformerConfig"],
     "models.deformable_detr": ["DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeformableDetrConfig"],
     "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"],
+    "models.deprecated": [],
+    "models.deprecated.bort": [],
+    "models.deprecated.mctct": [
+        "MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "MCTCTConfig",
+        "MCTCTFeatureExtractor",
+        "MCTCTProcessor",
+    ],
+    "models.deprecated.mmbt": ["MMBTConfig"],
+    "models.deprecated.retribert": [
+        "RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "RetriBertConfig",
+        "RetriBertTokenizer",
+    ],
+    "models.deprecated.tapex": ["TapexTokenizer"],
+    "models.deprecated.trajectory_transformer": [
+        "TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "TrajectoryTransformerConfig",
+    ],
+    "models.deprecated.van": ["VAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "VanConfig"],
     "models.deta": ["DETA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetaConfig"],
     "models.detr": ["DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetrConfig"],
     "models.dialogpt": [],
@@ -390,13 +409,11 @@ _import_structure = {
     "models.maskformer": ["MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "MaskFormerConfig", "MaskFormerSwinConfig"],
     "models.mbart": ["MBartConfig"],
     "models.mbart50": [],
-    "models.mctct": ["MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MCTCTConfig", "MCTCTFeatureExtractor", "MCTCTProcessor"],
     "models.mega": ["MEGA_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegaConfig"],
     "models.megatron_bert": ["MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegatronBertConfig"],
     "models.megatron_gpt2": [],
     "models.mgp_str": ["MGP_STR_PRETRAINED_CONFIG_ARCHIVE_MAP", "MgpstrConfig", "MgpstrProcessor", "MgpstrTokenizer"],
     "models.mluke": [],
-    "models.mmbt": ["MMBTConfig"],
     "models.mobilebert": ["MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileBertConfig", "MobileBertTokenizer"],
     "models.mobilenet_v1": ["MOBILENET_V1_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileNetV1Config"],
     "models.mobilenet_v2": ["MOBILENET_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileNetV2Config"],
@@ -451,7 +468,6 @@ _import_structure = {
     "models.regnet": ["REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "RegNetConfig"],
     "models.rembert": ["REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RemBertConfig"],
     "models.resnet": ["RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ResNetConfig"],
-    "models.retribert": ["RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RetriBertConfig", "RetriBertTokenizer"],
     "models.roberta": ["ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaConfig", "RobertaTokenizer"],
     "models.roberta_prelayernorm": ["ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaPreLayerNormConfig"],
     "models.roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoCBertConfig", "RoCBertTokenizer"],
@@ -498,17 +514,12 @@ _import_structure = {
     "models.t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config"],
     "models.table_transformer": ["TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TableTransformerConfig"],
     "models.tapas": ["TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP", "TapasConfig", "TapasTokenizer"],
-    "models.tapex": ["TapexTokenizer"],
     "models.time_series_transformer": [
         "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "TimeSeriesTransformerConfig",
     ],
     "models.timesformer": ["TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimesformerConfig"],
     "models.timm_backbone": ["TimmBackboneConfig"],
-    "models.trajectory_transformer": [
-        "TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
-        "TrajectoryTransformerConfig",
-    ],
     "models.transfo_xl": [
         "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "TransfoXLConfig",
@@ -536,7 +547,6 @@ _import_structure = {
         "UniSpeechSatConfig",
     ],
     "models.upernet": ["UperNetConfig"],
-    "models.van": ["VAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "VanConfig"],
     "models.videomae": ["VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "VideoMAEConfig"],
     "models.vilt": [
         "VILT_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -783,6 +793,7 @@ else:
     _import_structure["models.cpm"].append("CpmTokenizerFast")
     _import_structure["models.deberta"].append("DebertaTokenizerFast")
     _import_structure["models.deberta_v2"].append("DebertaV2TokenizerFast")
+    _import_structure["models.deprecated.retribert"].append("RetriBertTokenizerFast")
     _import_structure["models.distilbert"].append("DistilBertTokenizerFast")
     _import_structure["models.dpr"].extend(
         ["DPRContextEncoderTokenizerFast", "DPRQuestionEncoderTokenizerFast", "DPRReaderTokenizerFast"]
@@ -815,7 +826,6 @@ else:
     _import_structure["models.realm"].append("RealmTokenizerFast")
     _import_structure["models.reformer"].append("ReformerTokenizerFast")
     _import_structure["models.rembert"].append("RemBertTokenizerFast")
-    _import_structure["models.retribert"].append("RetriBertTokenizerFast")
     _import_structure["models.roberta"].append("RobertaTokenizerFast")
     _import_structure["models.roformer"].append("RoFormerTokenizerFast")
     _import_structure["models.splinter"].append("SplinterTokenizerFast")
@@ -1497,6 +1507,33 @@ else:
             "DeiTPreTrainedModel",
         ]
     )
+    _import_structure["models.deprecated.mctct"].extend(
+        [
+            "MCTCT_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "MCTCTForCTC",
+            "MCTCTModel",
+            "MCTCTPreTrainedModel",
+        ]
+    )
+    _import_structure["models.deprecated.mmbt"].extend(["MMBTForClassification", "MMBTModel", "ModalEmbeddings"])
+    _import_structure["models.deprecated.retribert"].extend(
+        ["RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST", "RetriBertModel", "RetriBertPreTrainedModel"]
+    )
+    _import_structure["models.deprecated.trajectory_transformer"].extend(
+        [
+            "TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "TrajectoryTransformerModel",
+            "TrajectoryTransformerPreTrainedModel",
+        ]
+    )
+    _import_structure["models.deprecated.van"].extend(
+        [
+            "VAN_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "VanForImageClassification",
+            "VanModel",
+            "VanPreTrainedModel",
+        ]
+    )
     _import_structure["models.deta"].extend(
         [
             "DETA_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2043,14 +2080,6 @@ else:
             "MBartPreTrainedModel",
         ]
     )
-    _import_structure["models.mctct"].extend(
-        [
-            "MCTCT_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "MCTCTForCTC",
-            "MCTCTModel",
-            "MCTCTPreTrainedModel",
-        ]
-    )
     _import_structure["models.mega"].extend(
         [
             "MEGA_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2087,7 +2116,6 @@ else:
             "MgpstrPreTrainedModel",
         ]
     )
-    _import_structure["models.mmbt"].extend(["MMBTForClassification", "MMBTModel", "ModalEmbeddings"])
     _import_structure["models.mobilebert"].extend(
         [
             "MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2419,9 +2447,6 @@ else:
             "ResNetPreTrainedModel",
         ]
     )
-    _import_structure["models.retribert"].extend(
-        ["RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST", "RetriBertModel", "RetriBertPreTrainedModel"]
-    )
     _import_structure["models.roberta"].extend(
         [
             "ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2660,13 +2685,6 @@ else:
         ]
     )
     _import_structure["models.timm_backbone"].extend(["TimmBackbone"])
-    _import_structure["models.trajectory_transformer"].extend(
-        [
-            "TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "TrajectoryTransformerModel",
-            "TrajectoryTransformerPreTrainedModel",
-        ]
-    )
     _import_structure["models.transfo_xl"].extend(
         [
             "TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2727,14 +2745,6 @@ else:
             "UperNetPreTrainedModel",
         ]
     )
-    _import_structure["models.van"].extend(
-        [
-            "VAN_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "VanForImageClassification",
-            "VanModel",
-            "VanPreTrainedModel",
-        ]
-    )
     _import_structure["models.videomae"].extend(
         [
             "VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -4187,6 +4197,24 @@ if TYPE_CHECKING:
     )
     from .models.deformable_detr import DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DeformableDetrConfig
     from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig
+    from .models.deprecated.mctct import (
+        MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        MCTCTConfig,
+        MCTCTFeatureExtractor,
+        MCTCTProcessor,
+    )
+    from .models.deprecated.mmbt import MMBTConfig
+    from .models.deprecated.retribert import (
+        RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        RetriBertConfig,
+        RetriBertTokenizer,
+    )
+    from .models.deprecated.tapex import TapexTokenizer
+    from .models.deprecated.trajectory_transformer import (
+        TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TrajectoryTransformerConfig,
+    )
+    from .models.deprecated.van import VAN_PRETRAINED_CONFIG_ARCHIVE_MAP, VanConfig
     from .models.deta import DETA_PRETRAINED_CONFIG_ARCHIVE_MAP, DetaConfig
     from .models.detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig
     from .models.dinat import DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP, DinatConfig
@@ -4304,11 +4332,9 @@ if TYPE_CHECKING:
     from .models.mask2former import MASK2FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, Mask2FormerConfig
     from .models.maskformer import MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, MaskFormerConfig, MaskFormerSwinConfig
     from .models.mbart import MBartConfig
-    from .models.mctct import MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP, MCTCTConfig, MCTCTFeatureExtractor, MCTCTProcessor
     from .models.mega import MEGA_PRETRAINED_CONFIG_ARCHIVE_MAP, MegaConfig
     from .models.megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig
     from .models.mgp_str import MGP_STR_PRETRAINED_CONFIG_ARCHIVE_MAP, MgpstrConfig, MgpstrProcessor, MgpstrTokenizer
-    from .models.mmbt import MMBTConfig
     from .models.mobilebert import MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileBertConfig, MobileBertTokenizer
     from .models.mobilenet_v1 import MOBILENET_V1_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileNetV1Config
     from .models.mobilenet_v2 import MOBILENET_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileNetV2Config
@@ -4359,7 +4385,6 @@ if TYPE_CHECKING:
     from .models.regnet import REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP, RegNetConfig
     from .models.rembert import REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RemBertConfig
     from .models.resnet import RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ResNetConfig
-    from .models.retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig, RetriBertTokenizer
     from .models.roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaTokenizer
     from .models.roberta_prelayernorm import (
         ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -4409,17 +4434,12 @@ if TYPE_CHECKING:
     from .models.t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
     from .models.table_transformer import TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TableTransformerConfig
     from .models.tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig, TapasTokenizer
-    from .models.tapex import TapexTokenizer
     from .models.time_series_transformer import (
         TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
         TimeSeriesTransformerConfig,
     )
     from .models.timesformer import TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TimesformerConfig
     from .models.timm_backbone import TimmBackboneConfig
-    from .models.trajectory_transformer import (
-        TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        TrajectoryTransformerConfig,
-    )
     from .models.transfo_xl import (
         TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
         TransfoXLConfig,
@@ -4432,7 +4452,6 @@ if TYPE_CHECKING:
     from .models.unispeech import UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechConfig
     from .models.unispeech_sat import UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechSatConfig
     from .models.upernet import UperNetConfig
-    from .models.van import VAN_PRETRAINED_CONFIG_ARCHIVE_MAP, VanConfig
     from .models.videomae import VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP, VideoMAEConfig
     from .models.vilt import (
         VILT_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -4667,6 +4686,7 @@ if TYPE_CHECKING:
         from .models.cpm import CpmTokenizerFast
         from .models.deberta import DebertaTokenizerFast
         from .models.deberta_v2 import DebertaV2TokenizerFast
+        from .models.deprecated.retribert import RetriBertTokenizerFast
         from .models.distilbert import DistilBertTokenizerFast
         from .models.dpr import DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast, DPRReaderTokenizerFast
         from .models.electra import ElectraTokenizerFast
@@ -4697,7 +4717,6 @@ if TYPE_CHECKING:
         from .models.realm import RealmTokenizerFast
         from .models.reformer import ReformerTokenizerFast
         from .models.rembert import RemBertTokenizerFast
-        from .models.retribert import RetriBertTokenizerFast
         from .models.roberta import RobertaTokenizerFast
         from .models.roformer import RoFormerTokenizerFast
         from .models.splinter import SplinterTokenizerFast
@@ -5262,6 +5281,29 @@ if TYPE_CHECKING:
             DeiTModel,
             DeiTPreTrainedModel,
         )
+        from .models.deprecated.mctct import (
+            MCTCT_PRETRAINED_MODEL_ARCHIVE_LIST,
+            MCTCTForCTC,
+            MCTCTModel,
+            MCTCTPreTrainedModel,
+        )
+        from .models.deprecated.mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings
+        from .models.deprecated.retribert import (
+            RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+            RetriBertModel,
+            RetriBertPreTrainedModel,
+        )
+        from .models.deprecated.trajectory_transformer import (
+            TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
+            TrajectoryTransformerModel,
+            TrajectoryTransformerPreTrainedModel,
+        )
+        from .models.deprecated.van import (
+            VAN_PRETRAINED_MODEL_ARCHIVE_LIST,
+            VanForImageClassification,
+            VanModel,
+            VanPreTrainedModel,
+        )
         from .models.deta import (
             DETA_PRETRAINED_MODEL_ARCHIVE_LIST,
             DetaForObjectDetection,
@@ -5698,7 +5740,6 @@ if TYPE_CHECKING:
             MBartModel,
             MBartPreTrainedModel,
         )
-        from .models.mctct import MCTCT_PRETRAINED_MODEL_ARCHIVE_LIST, MCTCTForCTC, MCTCTModel, MCTCTPreTrainedModel
         from .models.mega import (
             MEGA_PRETRAINED_MODEL_ARCHIVE_LIST,
             MegaForCausalLM,
@@ -5729,7 +5770,6 @@ if TYPE_CHECKING:
             MgpstrModel,
             MgpstrPreTrainedModel,
         )
-        from .models.mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings
         from .models.mobilebert import (
             MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
             MobileBertForMaskedLM,
@@ -6011,7 +6051,6 @@ if TYPE_CHECKING:
             ResNetModel,
             ResNetPreTrainedModel,
         )
-        from .models.retribert import RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST, RetriBertModel, RetriBertPreTrainedModel
         from .models.roberta import (
             ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
             RobertaForCausalLM,
@@ -6204,11 +6243,6 @@ if TYPE_CHECKING:
             TimesformerPreTrainedModel,
         )
         from .models.timm_backbone import TimmBackbone
-        from .models.trajectory_transformer import (
-            TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
-            TrajectoryTransformerModel,
-            TrajectoryTransformerPreTrainedModel,
-        )
         from .models.transfo_xl import (
             TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
             AdaptiveEmbedding,
@@ -6252,12 +6286,6 @@ if TYPE_CHECKING:
             UniSpeechSatPreTrainedModel,
         )
         from .models.upernet import UperNetForSemanticSegmentation, UperNetPreTrainedModel
-        from .models.van import (
-            VAN_PRETRAINED_MODEL_ARCHIVE_LIST,
-            VanForImageClassification,
-            VanModel,
-            VanPreTrainedModel,
-        )
         from .models.videomae import (
             VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST,
             VideoMAEForPreTraining,
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index b235f9ba35..7f698f15d9 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -36,7 +36,6 @@ from . import (
     blip,
     blip_2,
     bloom,
-    bort,
     bridgetower,
     byt5,
     camembert,
@@ -60,6 +59,7 @@ from . import (
     decision_transformer,
     deformable_detr,
     deit,
+    deprecated,
     deta,
     detr,
     dialogpt,
@@ -122,13 +122,11 @@ from . import (
     maskformer,
     mbart,
     mbart50,
-    mctct,
     mega,
     megatron_bert,
     megatron_gpt2,
     mgp_str,
     mluke,
-    mmbt,
     mobilebert,
     mobilenet_v1,
     mobilenet_v2,
@@ -164,7 +162,6 @@ from . import (
     regnet,
     rembert,
     resnet,
-    retribert,
     roberta,
     roberta_prelayernorm,
     roc_bert,
@@ -188,11 +185,9 @@ from . import (
     t5,
     table_transformer,
     tapas,
-    tapex,
     time_series_transformer,
     timesformer,
     timm_backbone,
-    trajectory_transformer,
     transfo_xl,
     trocr,
     tvlt,
@@ -200,7 +195,6 @@ from . import (
     unispeech,
     unispeech_sat,
     upernet,
-    van,
     videomae,
     vilt,
     vision_encoder_decoder,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 0cc8f93dea..25f9ccc65e 100755
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -640,6 +640,17 @@ MODEL_NAMES_MAPPING = OrderedDict(
     ]
 )
 
+# Model types whose modules were moved under `transformers.models.deprecated`.
+DEPRECATED_MODELS = [
+    "bort",
+    "mctct",
+    "mmbt",
+    "retribert",
+    "tapex",
+    "trajectory_transformer",
+    "van",
+]
+
 SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
     [
         ("openai-gpt", "openai"),
@@ -659,7 +670,12 @@ def model_type_to_module_name(key):
     if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME:
         return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key]
 
-    return key.replace("-", "_")
+    key = key.replace("-", "_")
+    # Deprecated models live in the `deprecated` subpackage, so prefix the module path.
+    if key in DEPRECATED_MODELS:
+        key = f"deprecated.{key}"
+
+    return key
 
 
 def config_class_to_model_type(config):
diff --git a/src/transformers/models/bort/__init__.py b/src/transformers/models/deprecated/__init__.py
similarity index 100%
rename from src/transformers/models/bort/__init__.py
rename to src/transformers/models/deprecated/__init__.py
diff --git a/tests/models/bort/__init__.py b/src/transformers/models/deprecated/bort/__init__.py
similarity index 100%
rename from tests/models/bort/__init__.py
rename to src/transformers/models/deprecated/bort/__init__.py
diff --git a/src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py
similarity index 100%
rename from src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py
rename to src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py
diff --git a/src/transformers/models/mctct/__init__.py b/src/transformers/models/deprecated/mctct/__init__.py
similarity index 95%
rename from src/transformers/models/mctct/__init__.py
rename to src/transformers/models/deprecated/mctct/__init__.py
index f389ebd9e7..567be97b7c 100644
--- a/src/transformers/models/mctct/__init__.py
+++ b/src/transformers/models/deprecated/mctct/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/mctct/configuration_mctct.py b/src/transformers/models/deprecated/mctct/configuration_mctct.py
similarity index 99%
rename from src/transformers/models/mctct/configuration_mctct.py
rename to src/transformers/models/deprecated/mctct/configuration_mctct.py
index 6389f2238f..4797a77d29 100644
--- a/src/transformers/models/mctct/configuration_mctct.py
+++ b/src/transformers/models/deprecated/mctct/configuration_mctct.py
@@ -14,8 +14,8 @@
 # limitations under the License.
 """M-CTC-T model configuration"""
 
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/mctct/feature_extraction_mctct.py b/src/transformers/models/deprecated/mctct/feature_extraction_mctct.py
similarity index 97%
rename from src/transformers/models/mctct/feature_extraction_mctct.py
rename to src/transformers/models/deprecated/mctct/feature_extraction_mctct.py
index 23ae02ecad..e1e17c4b12 100644
--- a/src/transformers/models/mctct/feature_extraction_mctct.py
+++ b/src/transformers/models/deprecated/mctct/feature_extraction_mctct.py
@@ -20,11 +20,11 @@ from typing import List, Optional, Union
 
 import numpy as np
 
-from ...audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...file_utils import PaddingStrategy, TensorType
-from ...utils import logging
+from ....audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
+from ....feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ....feature_extraction_utils import BatchFeature
+from ....file_utils import PaddingStrategy, TensorType
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/mctct/modeling_mctct.py b/src/transformers/models/deprecated/mctct/modeling_mctct.py
similarity index 99%
rename from src/transformers/models/mctct/modeling_mctct.py
rename to src/transformers/models/deprecated/mctct/modeling_mctct.py
index 4b965b27ec..5b98a8a607 100755
--- a/src/transformers/models/mctct/modeling_mctct.py
+++ b/src/transformers/models/deprecated/mctct/modeling_mctct.py
@@ -22,17 +22,17 @@ import torch
 import torch.utils.checkpoint
 from torch import nn
 
-from ...activations import ACT2FN
-from ...deepspeed import is_deepspeed_zero3_enabled
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
-from ...modeling_outputs import BaseModelOutput, CausalLMOutput
-from ...modeling_utils import (
+from ....activations import ACT2FN
+from ....deepspeed import is_deepspeed_zero3_enabled
+from ....file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
+from ....modeling_outputs import BaseModelOutput, CausalLMOutput
+from ....modeling_utils import (
     PreTrainedModel,
     apply_chunking_to_forward,
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ....utils import logging
 from .configuration_mctct import MCTCTConfig
 
 
diff --git a/src/transformers/models/mctct/processing_mctct.py b/src/transformers/models/deprecated/mctct/processing_mctct.py
similarity index 99%
rename from src/transformers/models/mctct/processing_mctct.py
rename to src/transformers/models/deprecated/mctct/processing_mctct.py
index eb20fa09b3..764ed8d3db 100644
--- a/src/transformers/models/mctct/processing_mctct.py
+++ b/src/transformers/models/deprecated/mctct/processing_mctct.py
@@ -18,7 +18,7 @@ Speech processor class for M-CTC-T
 import warnings
 from contextlib import contextmanager
 
-from ...processing_utils import ProcessorMixin
+from ....processing_utils import ProcessorMixin
 
 
 class MCTCTProcessor(ProcessorMixin):
diff --git a/src/transformers/models/mmbt/__init__.py b/src/transformers/models/deprecated/mmbt/__init__.py
similarity index 94%
rename from src/transformers/models/mmbt/__init__.py
rename to src/transformers/models/deprecated/mmbt/__init__.py
index 29aee5a0cd..e467090cb4 100644
--- a/src/transformers/models/mmbt/__init__.py
+++ b/src/transformers/models/deprecated/mmbt/__init__.py
@@ -14,7 +14,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
 
 
 _import_structure = {"configuration_mmbt": ["MMBTConfig"]}
diff --git a/src/transformers/models/mmbt/configuration_mmbt.py b/src/transformers/models/deprecated/mmbt/configuration_mmbt.py
similarity index 98%
rename from src/transformers/models/mmbt/configuration_mmbt.py
rename to src/transformers/models/deprecated/mmbt/configuration_mmbt.py
index aa453db592..df5161b092 100644
--- a/src/transformers/models/mmbt/configuration_mmbt.py
+++ b/src/transformers/models/deprecated/mmbt/configuration_mmbt.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 """ MMBT configuration"""
 
-from ...utils import logging
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/mmbt/modeling_mmbt.py b/src/transformers/models/deprecated/mmbt/modeling_mmbt.py
similarity index 98%
rename from src/transformers/models/mmbt/modeling_mmbt.py
rename to src/transformers/models/deprecated/mmbt/modeling_mmbt.py
index 220cfb49c9..db0cef3a65 100644
--- a/src/transformers/models/mmbt/modeling_mmbt.py
+++ b/src/transformers/models/deprecated/mmbt/modeling_mmbt.py
@@ -20,9 +20,9 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from ...modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput
-from ...modeling_utils import ModuleUtilsMixin
-from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ....modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput
+from ....modeling_utils import ModuleUtilsMixin
+from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/retribert/__init__.py b/src/transformers/models/deprecated/retribert/__init__.py
similarity index 95%
rename from src/transformers/models/retribert/__init__.py
rename to src/transformers/models/deprecated/retribert/__init__.py
index c4f4bf6cc0..dba5e14594 100644
--- a/src/transformers/models/retribert/__init__.py
+++ b/src/transformers/models/deprecated/retribert/__init__.py
@@ -14,7 +14,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/retribert/configuration_retribert.py b/src/transformers/models/deprecated/retribert/configuration_retribert.py
similarity index 98%
rename from src/transformers/models/retribert/configuration_retribert.py
rename to src/transformers/models/deprecated/retribert/configuration_retribert.py
index 33663ad616..11d19193b3 100644
--- a/src/transformers/models/retribert/configuration_retribert.py
+++ b/src/transformers/models/deprecated/retribert/configuration_retribert.py
@@ -14,8 +14,8 @@
 # limitations under the License.
 """ RetriBERT model configuration"""
 
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/retribert/modeling_retribert.py b/src/transformers/models/deprecated/retribert/modeling_retribert.py
similarity index 98%
rename from src/transformers/models/retribert/modeling_retribert.py
rename to src/transformers/models/deprecated/retribert/modeling_retribert.py
index 240d9476e7..00d47bce51 100644
--- a/src/transformers/models/retribert/modeling_retribert.py
+++ b/src/transformers/models/deprecated/retribert/modeling_retribert.py
@@ -24,9 +24,9 @@ import torch
 import torch.utils.checkpoint as checkpoint
 from torch import nn
 
-from ...modeling_utils import PreTrainedModel
-from ...utils import add_start_docstrings, logging
-from ..bert.modeling_bert import BertModel
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_start_docstrings, logging
+from ...bert.modeling_bert import BertModel
 from .configuration_retribert import RetriBertConfig
 
 
diff --git a/src/transformers/models/retribert/tokenization_retribert.py b/src/transformers/models/deprecated/retribert/tokenization_retribert.py
similarity index 99%
rename from src/transformers/models/retribert/tokenization_retribert.py
rename to src/transformers/models/deprecated/retribert/tokenization_retribert.py
index b4a0f06192..4529e8e902 100644
--- a/src/transformers/models/retribert/tokenization_retribert.py
+++ b/src/transformers/models/deprecated/retribert/tokenization_retribert.py
@@ -19,8 +19,8 @@ import os
 import unicodedata
 from typing import List, Optional, Tuple
 
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
-from ...utils import logging
+from ....tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/retribert/tokenization_retribert_fast.py b/src/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
similarity index 98%
rename from src/transformers/models/retribert/tokenization_retribert_fast.py
rename to src/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
index c242213e1f..30cb69c2b3 100644
--- a/src/transformers/models/retribert/tokenization_retribert_fast.py
+++ b/src/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
@@ -19,8 +19,8 @@ from typing import List, Optional, Tuple
 
 from tokenizers import normalizers
 
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ....tokenization_utils_fast import PreTrainedTokenizerFast
+from ....utils import logging
 from .tokenization_retribert import RetriBertTokenizer
 
 
diff --git a/src/transformers/models/tapex/__init__.py b/src/transformers/models/deprecated/tapex/__init__.py
similarity index 95%
rename from src/transformers/models/tapex/__init__.py
rename to src/transformers/models/deprecated/tapex/__init__.py
index f6d293504e..82bbacd15b 100644
--- a/src/transformers/models/tapex/__init__.py
+++ b/src/transformers/models/deprecated/tapex/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule
+from ....utils import _LazyModule
 
 
 _import_structure = {"tokenization_tapex": ["TapexTokenizer"]}
diff --git a/src/transformers/models/tapex/tokenization_tapex.py b/src/transformers/models/deprecated/tapex/tokenization_tapex.py
similarity index 99%
rename from src/transformers/models/tapex/tokenization_tapex.py
rename to src/transformers/models/deprecated/tapex/tokenization_tapex.py
index e2543a3378..d0cd49212c 100644
--- a/src/transformers/models/tapex/tokenization_tapex.py
+++ b/src/transformers/models/deprecated/tapex/tokenization_tapex.py
@@ -22,10 +22,10 @@ from typing import Dict, List, Optional, Tuple, Union
 
 import regex as re
 
-from ...file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
-from ...tokenization_utils_base import ENCODE_KWARGS_DOCSTRING, BatchEncoding, TextInput, TruncationStrategy
-from ...utils import logging
+from ....file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available
+from ....tokenization_utils import AddedToken, PreTrainedTokenizer
+from ....tokenization_utils_base import ENCODE_KWARGS_DOCSTRING, BatchEncoding, TextInput, TruncationStrategy
+from ....utils import logging
 
 
 if is_pandas_available():
diff --git a/src/transformers/models/trajectory_transformer/__init__.py b/src/transformers/models/deprecated/trajectory_transformer/__init__.py
similarity index 95%
rename from src/transformers/models/trajectory_transformer/__init__.py
rename to src/transformers/models/deprecated/trajectory_transformer/__init__.py
index d529275e5a..b7af1bb48c 100644
--- a/src/transformers/models/trajectory_transformer/__init__.py
+++ b/src/transformers/models/deprecated/trajectory_transformer/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py b/src/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py
similarity index 98%
rename from src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py
rename to src/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py
index 875980fde1..a64a0cbd89 100644
--- a/src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py
+++ b/src/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py
@@ -14,8 +14,8 @@
 # limitations under the License.
 """ TrajectoryTransformer model configuration"""
 
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py
similarity index 100%
rename from src/transformers/models/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py
rename to src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py
diff --git a/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py b/src/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py
similarity index 99%
rename from src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py
rename to src/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py
index 1f634a9893..75415dbe77 100644
--- a/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py
+++ b/src/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py
@@ -25,8 +25,8 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import functional as F
 
-from ...modeling_utils import PreTrainedModel
-from ...utils import (
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
     ModelOutput,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
diff --git a/src/transformers/models/van/__init__.py b/src/transformers/models/deprecated/van/__init__.py
similarity index 93%
rename from src/transformers/models/van/__init__.py
rename to src/transformers/models/deprecated/van/__init__.py
index c696c5c5e5..2db730984f 100644
--- a/src/transformers/models/van/__init__.py
+++ b/src/transformers/models/deprecated/van/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
 
 
 _import_structure = {"configuration_van": ["VAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "VanConfig"]}
diff --git a/src/transformers/models/van/configuration_van.py b/src/transformers/models/deprecated/van/configuration_van.py
similarity index 98%
rename from src/transformers/models/van/configuration_van.py
rename to src/transformers/models/deprecated/van/configuration_van.py
index 85a0dd20e4..798c8c1944 100644
--- a/src/transformers/models/van/configuration_van.py
+++ b/src/transformers/models/deprecated/van/configuration_van.py
@@ -14,8 +14,8 @@
 # limitations under the License.
 """ VAN model configuration"""
 
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/van/convert_van_to_pytorch.py b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py
similarity index 100%
rename from src/transformers/models/van/convert_van_to_pytorch.py
rename to src/transformers/models/deprecated/van/convert_van_to_pytorch.py
diff --git a/src/transformers/models/van/modeling_van.py b/src/transformers/models/deprecated/van/modeling_van.py
similarity index 98%
rename from src/transformers/models/van/modeling_van.py
rename to src/transformers/models/deprecated/van/modeling_van.py
index 59c8655aa9..f7feebae4d 100644
--- a/src/transformers/models/van/modeling_van.py
+++ b/src/transformers/models/deprecated/van/modeling_van.py
@@ -23,14 +23,14 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from ...activations import ACT2FN
-from ...modeling_outputs import (
+from ....activations import ACT2FN
+from ....modeling_outputs import (
     BaseModelOutputWithNoAttention,
     BaseModelOutputWithPoolingAndNoAttention,
     ImageClassifierOutputWithNoAttention,
 )
-from ...modeling_utils import PreTrainedModel
-from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_van import VanConfig
 
 
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 25a8943021..b583e8aad0 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -2306,6 +2306,109 @@ class DeiTPreTrainedModel(metaclass=DummyObject):
         requires_backends(self, ["torch"])
 
 
+MCTCT_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class MCTCTForCTC(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MCTCTModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MCTCTPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MMBTForClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MMBTModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class ModalEmbeddings(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class RetriBertModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class RetriBertPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class TrajectoryTransformerModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class TrajectoryTransformerPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+VAN_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class VanForImageClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class VanModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class VanPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 DETA_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
@@ -4484,30 +4587,6 @@ class MBartPreTrainedModel(metaclass=DummyObject):
         requires_backends(self, ["torch"])
 
 
-MCTCT_PRETRAINED_MODEL_ARCHIVE_LIST = None
-
-
-class MCTCTForCTC(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class MCTCTModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class MCTCTPreTrainedModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
 MEGA_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
@@ -4664,27 +4743,6 @@ class MgpstrPreTrainedModel(metaclass=DummyObject):
         requires_backends(self, ["torch"])
 
 
-class MMBTForClassification(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class MMBTModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class ModalEmbeddings(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
 MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
@@ -6088,23 +6146,6 @@ class ResNetPreTrainedModel(metaclass=DummyObject):
         requires_backends(self, ["torch"])
 
 
-RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
-
-
-class RetriBertModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class RetriBertPreTrainedModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
 ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
@@ -7030,23 +7071,6 @@ class TimmBackbone(metaclass=DummyObject):
         requires_backends(self, ["torch"])
 
 
-TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
-
-
-class TrajectoryTransformerModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class TrajectoryTransformerPreTrainedModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
 TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
@@ -7276,30 +7300,6 @@ class UperNetPreTrainedModel(metaclass=DummyObject):
         requires_backends(self, ["torch"])
 
 
-VAN_PRETRAINED_MODEL_ARCHIVE_LIST = None
-
-
-class VanForImageClassification(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class VanModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class VanPreTrainedModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
 VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py
index 5ab17d1348..80c6913874 100644
--- a/src/transformers/utils/dummy_tokenizers_objects.py
+++ b/src/transformers/utils/dummy_tokenizers_objects.py
@@ -107,6 +107,13 @@ class DebertaV2TokenizerFast(metaclass=DummyObject):
         requires_backends(self, ["tokenizers"])
 
 
+class RetriBertTokenizerFast(metaclass=DummyObject):
+    _backends = ["tokenizers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tokenizers"])
+
+
 class DistilBertTokenizerFast(metaclass=DummyObject):
     _backends = ["tokenizers"]
 
@@ -331,13 +338,6 @@ class RemBertTokenizerFast(metaclass=DummyObject):
         requires_backends(self, ["tokenizers"])
 
 
-class RetriBertTokenizerFast(metaclass=DummyObject):
-    _backends = ["tokenizers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["tokenizers"])
-
-
 class RobertaTokenizerFast(metaclass=DummyObject):
     _backends = ["tokenizers"]
 
diff --git a/tests/models/bort/test_modeling_bort.py b/tests/models/bort/test_modeling_bort.py
deleted file mode 100644
index 79ca940801..0000000000
--- a/tests/models/bort/test_modeling_bort.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from transformers import is_torch_available
-from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import AutoModel
-
-
-@require_torch
-@require_sentencepiece
-@require_tokenizers
-class BortIntegrationTest(unittest.TestCase):
-    @slow
-    def test_output_embeds_base_model(self):
-        model = AutoModel.from_pretrained("amazon/bort")
-        model.to(torch_device)
-
-        input_ids = torch.tensor(
-            [[0, 18077, 4082, 7804, 8606, 6195, 2457, 3321, 11, 10489, 16, 269, 2579, 328, 2]],
-            device=torch_device,
-            dtype=torch.long,
-        )  # Schloß Nymphenburg in Munich is really nice!
-        output = model(input_ids)["last_hidden_state"]
-        expected_shape = torch.Size((1, 15, 1024))
-        self.assertEqual(output.shape, expected_shape)
-        # compare the actual values for a slice.
-        expected_slice = torch.tensor(
-            [[[-0.0349, 0.0436, -1.8654], [-0.6964, 0.0835, -1.7393], [-0.9819, 0.2956, -0.2868]]],
-            device=torch_device,
-            dtype=torch.float,
-        )
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
diff --git a/tests/models/bort/test_modeling_tf_bort.py b/tests/models/bort/test_modeling_tf_bort.py
deleted file mode 100644
index 35abe53d89..0000000000
--- a/tests/models/bort/test_modeling_tf_bort.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import unittest
-
-from transformers import is_tf_available
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-
-
-if is_tf_available():
-    import numpy as np
-    import tensorflow as tf
-
-    from transformers import TFAutoModel
-
-
-@require_tf
-@require_sentencepiece
-@require_tokenizers
-class TFBortIntegrationTest(unittest.TestCase):
-    @slow
-    def test_output_embeds_base_model(self):
-        model = TFAutoModel.from_pretrained("amazon/bort")
-
-        input_ids = tf.convert_to_tensor(
-            [[0, 18077, 4082, 7804, 8606, 6195, 2457, 3321, 11, 10489, 16, 269, 2579, 328, 2]],
-            dtype=tf.int32,
-        )  # Schloß Nymphenburg in Munich is really nice!
-
-        output = model(input_ids)["last_hidden_state"]
-        expected_shape = tf.TensorShape((1, 15, 1024))
-        self.assertEqual(output.shape, expected_shape)
-        # compare the actual values for a slice.
-        expected_slice = tf.convert_to_tensor(
-            [[[-0.0349, 0.0436, -1.8654], [-0.6964, 0.0835, -1.7393], [-0.9819, 0.2956, -0.2868]]],
-            dtype=tf.float32,
-        )
-
-        self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
diff --git a/tests/models/mctct/__init__.py b/tests/models/mctct/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/tests/models/mctct/test_feature_extraction_mctct.py b/tests/models/mctct/test_feature_extraction_mctct.py
deleted file mode 100644
index f1825c3640..0000000000
--- a/tests/models/mctct/test_feature_extraction_mctct.py
+++ /dev/null
@@ -1,311 +0,0 @@
-# coding=utf-8
-# Copyright 2022 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import itertools
-import random
-import unittest
-
-import numpy as np
-
-from transformers import MCTCTFeatureExtractor
-from transformers.testing_utils import require_torch
-
-from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
-
-
-global_rng = random.Random()
-
-
-def floats_list(shape, scale=1.0, rng=None, name=None):
-    """Creates a random float32 tensor"""
-    if rng is None:
-        rng = global_rng
-
-    values = []
-    for _batch_idx in range(shape[0]):
-        values.append([])
-        for _ in range(shape[1]):
-            values[-1].append(rng.random() * scale)
-
-    return values
-
-
-@require_torch
-class MCTCTFeatureExtractionTester(unittest.TestCase):
-    def __init__(
-        self,
-        parent,
-        batch_size=7,
-        min_seq_length=400,
-        max_seq_length=2000,
-        feature_size=24,
-        num_mel_bins=24,
-        padding_value=0.0,
-        sampling_rate=16_000,
-        return_attention_mask=True,
-        do_normalize=True,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.min_seq_length = min_seq_length
-        self.max_seq_length = max_seq_length
-        self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
-        self.feature_size = feature_size
-        self.num_mel_bins = num_mel_bins
-        self.padding_value = padding_value
-        self.sampling_rate = sampling_rate
-        self.return_attention_mask = return_attention_mask
-        self.do_normalize = do_normalize
-
-    def prepare_feat_extract_dict(self):
-        return {
-            "feature_size": self.feature_size,
-            "num_mel_bins": self.num_mel_bins,
-            "padding_value": self.padding_value,
-            "sampling_rate": self.sampling_rate,
-            "return_attention_mask": self.return_attention_mask,
-            "do_normalize": self.do_normalize,
-        }
-
-    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
-        def _flatten(list_of_lists):
-            return list(itertools.chain(*list_of_lists))
-
-        if equal_length:
-            speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
-        else:
-            # make sure that inputs increase in size
-            speech_inputs = [
-                floats_list((x, self.feature_size))
-                for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
-            ]
-        if numpify:
-            speech_inputs = [np.asarray(x) for x in speech_inputs]
-        return speech_inputs
-
-
-@require_torch
-class MCTCTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
-    feature_extraction_class = MCTCTFeatureExtractor
-
-    def setUp(self):
-        self.feat_extract_tester = MCTCTFeatureExtractionTester(self)
-
-    def _check_zero_mean_unit_variance(self, input_vector):
-        self.assertTrue(np.all(np.mean(input_vector) < 1e-3))
-        self.assertTrue(np.all(np.abs(np.var(input_vector) - 1) < 1e-3))
-
-    def test_call(self):
-        # Tests that all call wrap to encode_plus and batch_encode_plus
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
-        # create three inputs of length 800, 1000, and 1200
-        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
-        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
-
-        # Test feature size
-        input_features = feature_extractor(np_speech_inputs, padding=True, return_tensors="np").input_features
-        self.assertTrue(input_features.ndim == 3)
-        self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size)
-
-        # Test not batched input
-        encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
-        encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
-        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
-
-        # Test batched
-        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
-        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
-        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
-            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
-
-        # Test 2-D numpy arrays are batched.
-        speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
-        np_speech_inputs = np.asarray(speech_inputs)
-        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
-        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
-        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
-            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
-
-    def test_cepstral_mean_and_variance_normalization(self):
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
-        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
-
-        paddings = ["longest", "max_length", "do_not_pad"]
-        max_lengths = [None, 16, None]
-        for max_length, padding in zip(max_lengths, paddings):
-            inputs = feature_extractor(
-                speech_inputs,
-                padding=padding,
-                max_length=max_length,
-                return_attention_mask=True,
-                truncation=max_length is not None,  # reference to #16419
-            )
-            input_features = inputs.input_features
-            attention_mask = inputs.attention_mask
-            fbank_feat_lengths = [np.sum(x) for x in attention_mask]
-            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]])
-            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]])
-            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]])
-
-    def test_cepstral_mean_and_variance_normalization_np(self):
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
-        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
-
-        paddings = ["longest", "max_length", "do_not_pad"]
-        max_lengths = [None, 16, None]
-        for max_length, padding in zip(max_lengths, paddings):
-            inputs = feature_extractor(
-                speech_inputs,
-                max_length=max_length,
-                padding=padding,
-                return_tensors="np",
-                return_attention_mask=True,
-                truncation=max_length is not None,
-            )
-            input_features = inputs.input_features
-            attention_mask = inputs.attention_mask
-            fbank_feat_lengths = [np.sum(x) for x in attention_mask]
-
-            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]])
-            self.assertTrue(input_features[0][fbank_feat_lengths[0] :].sum() < 1e-6)
-            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]])
-            self.assertTrue(input_features[0][fbank_feat_lengths[1] :].sum() < 1e-6)
-            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]])
-
-    def test_cepstral_mean_and_variance_normalization_trunc_max_length(self):
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
-        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
-        inputs = feature_extractor(
-            speech_inputs,
-            padding="max_length",
-            max_length=4,
-            truncation=True,
-            return_tensors="np",
-            return_attention_mask=True,
-        )
-        input_features = inputs.input_features
-        attention_mask = inputs.attention_mask
-        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)
-
-        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
-        self._check_zero_mean_unit_variance(input_features[1])
-        self._check_zero_mean_unit_variance(input_features[2])
-
-    def test_cepstral_mean_and_variance_normalization_trunc_longest(self):
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
-        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
-        inputs = feature_extractor(
-            speech_inputs,
-            padding="longest",
-            max_length=4,
-            truncation=True,
-            return_tensors="np",
-            return_attention_mask=True,
-        )
-        input_features = inputs.input_features
-        attention_mask = inputs.attention_mask
-        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)
-
-        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
-        self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
-        self._check_zero_mean_unit_variance(input_features[2])
-
-        # make sure that if max_length < longest -> then pad to max_length
-        self.assertEqual(input_features.shape, (3, 4, 24))
-
-        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
-        inputs = feature_extractor(
-            speech_inputs,
-            padding="longest",
-            max_length=16,
-            truncation=True,
-            return_tensors="np",
-            return_attention_mask=True,
-        )
-        input_features = inputs.input_features
-        attention_mask = inputs.attention_mask
-        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)
-
-        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
-        self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
-        self._check_zero_mean_unit_variance(input_features[2])
-
-        # make sure that if max_length < longest -> then pad to max_length
-        self.assertEqual(input_features.shape, (3, 16, 24))
-
-    def test_double_precision_pad(self):
-        import torch
-
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
-        np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
-        py_speech_inputs = np_speech_inputs.tolist()
-
-        for inputs in [py_speech_inputs, np_speech_inputs]:
-            np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
-            self.assertTrue(np_processed.input_features.dtype == np.float32)
-            pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
-            self.assertTrue(pt_processed.input_features.dtype == torch.float32)
-
-    def test_different_window(self):
-        import torch
-
-        init_dict = self.feat_extract_tester.prepare_feat_extract_dict()
-        init_dict["win_function"] = "hann_window"
-
-        feature_extractor = self.feature_extraction_class(**init_dict)
-        np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
-        py_speech_inputs = np_speech_inputs.tolist()
-
-        for inputs in [py_speech_inputs, np_speech_inputs]:
-            np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
-            self.assertTrue(np_processed.input_features.dtype == np.float32)
-            pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
-            self.assertTrue(pt_processed.input_features.dtype == torch.float32)
-
-    def _load_datasamples(self, num_samples):
-        from datasets import load_dataset
-
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
-
-        return [x["array"] for x in speech_samples]
-
-    def test_integration(self):
-        # fmt: off
-        expected = np.array([
-            [
-                1.1280,  1.1319,  1.2744,  1.4369,  1.4328,  1.3671,  1.2889,  1.3046,
-                1.4419,  0.8387,  0.2995,  0.0404,  0.1068,  0.0472,  0.3728,  1.3356,
-                1.4491,  0.4770,  0.3997,  0.2776,  0.3184, -0.1243, -0.1170, -0.0828
-            ],
-            [
-                1.0826,  1.0565,  1.2110,  1.3886,  1.3416,  1.2009,  1.1894,  1.2707,
-                1.5153,  0.7005,  0.4916,  0.4017,  0.3743,  0.1935,  0.4228,  1.1084,
-                0.9768,  0.0608,  0.2044,  0.1723,  0.0433, -0.2360, -0.2478, -0.2643
-            ],
-            [
-                1.0590,  0.9923,  1.1185,  1.3309,  1.1971,  1.0067,  1.0080,  1.2036,
-                1.5397,  1.0383,  0.7672,  0.7551,  0.4878,  0.8771,  0.7565,  0.8775,
-                0.9042,  0.4595,  0.6157,  0.4954,  0.1857,  0.0307,  0.0199,  0.1033
-            ],
-        ])
-        # fmt: on
-
-        input_speech = self._load_datasamples(1)
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
-        input_features = feature_extractor(input_speech, sampling_rate=16000, return_tensors="pt").input_features
-        self.assertTrue(np.allclose(input_features[0, 100:103], expected, atol=1e-4))
diff --git a/tests/models/mctct/test_modeling_mctct.py b/tests/models/mctct/test_modeling_mctct.py
deleted file mode 100644
index 21fadd633c..0000000000
--- a/tests/models/mctct/test_modeling_mctct.py
+++ /dev/null
@@ -1,651 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Testing suite for the PyTorch MCTCT model. """
-
-import inspect
-import math
-import unittest
-
-from datasets import load_dataset
-
-from transformers import MCTCTConfig, is_torch_available
-from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import MCTCTForCTC, MCTCTModel, MCTCTProcessor
-
-
-class MCTCTModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=10,
-        seq_length=40,  # speech is longer
-        is_training=False,
-        vocab_size=32,
-        hidden_size=128 * 4,
-        num_hidden_layers=4,
-        intermediate_size=20,
-        num_attention_heads=4,
-        attention_head_dim=128,
-        max_position_embeddings=920,
-        layer_norm_eps=1e-5,
-        layerdrop=0.3,
-        hidden_act="relu",
-        initializer_range=0.02,
-        hidden_dropout_prob=0.3,
-        attention_probs_dropout_prob=0.3,
-        conv_glu_dim=1,
-        conv_dropout=0.3,
-        num_conv_layers=1,
-        conv_kernel=(7,),
-        conv_stride=(3,),
-        input_feat_per_channel=80,
-        input_channels=1,
-        conv_channels=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length  # speech is longer
-        self.is_training = is_training
-
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.intermediate_size = intermediate_size
-        self.num_attention_heads = num_attention_heads
-
-        self.attention_head_dim = attention_head_dim
-        self.max_position_embeddings = max_position_embeddings
-
-        self.layer_norm_eps = layer_norm_eps
-        self.layerdrop = layerdrop
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-
-        self.conv_glu_dim = conv_glu_dim
-        self.conv_dropout = conv_dropout
-        self.num_conv_layers = num_conv_layers
-        self.conv_kernel = conv_kernel
-        self.conv_stride = conv_stride
-        self.input_feat_per_channel = input_feat_per_channel
-        self.input_channels = input_channels
-        self.conv_channels = conv_channels
-
-        output_seq_length = self.seq_length
-        dilation = 1
-        for _, kernel_sz, stride in zip(range(self.num_conv_layers), self.conv_kernel, self.conv_stride):
-            padding = kernel_sz // 2
-            output_seq_length = output_seq_length + 2 * padding - dilation * (kernel_sz - 1) - 1
-            output_seq_length = torch.div(output_seq_length, stride, rounding_mode="trunc") + 1
-
-        self.output_seq_length = int(math.ceil(output_seq_length))
-        self.encoder_seq_length = self.output_seq_length
-
-    def prepare_config_and_inputs(self):
-        input_features = floats_tensor(
-            [self.batch_size, self.seq_length, self.input_feat_per_channel], self.vocab_size
-        )
-        attention_mask = torch.ones([self.batch_size, self.seq_length], dtype=torch.long, device=torch_device)
-
-        config = self.get_config()
-
-        return config, input_features, attention_mask
-
-    def get_config(self):
-        return MCTCTConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            intermediate_size=self.intermediate_size,
-            num_attention_heads=self.num_attention_heads,
-            attention_head_dim=self.attention_head_dim,
-            max_position_embeddings=self.max_position_embeddings,
-            layer_norm_eps=self.layer_norm_eps,
-            layerdrop=self.layerdrop,
-            hidden_act=self.hidden_act,
-            initializer_range=self.initializer_range,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            conv_glu_dim=self.conv_glu_dim,
-            conv_dropout=self.conv_dropout,
-            num_conv_layers=self.num_conv_layers,
-            conv_kernel=self.conv_kernel,
-            conv_stride=self.conv_stride,
-            input_feat_per_channel=self.input_feat_per_channel,
-            input_channels=self.input_channels,
-            conv_channels=self.conv_channels,
-        )
-
-    def create_and_check_model(self, config, input_features, attention_mask):
-        model = MCTCTModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_features, attention_mask=attention_mask)
-
-        self.parent.assertEqual(
-            result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size)
-        )
-
-    def create_and_check_model_for_ctc(self, config, input_features, attention_mask):
-        config.add_adapter = True
-        config.output_hidden_size = 2 * config.hidden_size
-        model = MCTCTForCTC(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_features, attention_mask=attention_mask)
-        self.parent.assertEqual(
-            result.logits.shape, (self.batch_size, self.adapter_output_seq_length, self.vocab_size)
-        )
-
-    def create_and_check_batch_inference(self, config, input_features, *args):
-        # test does not pass for models making use of `group_norm`
-        # check: https://github.com/pytorch/fairseq/issues/3227
-        model = MCTCTModel(config=config)
-        model.to(torch_device)
-        model.eval()
-
-        input_features = input_features[:3]
-        attention_mask = torch.ones(input_features.shape[:-1], device=torch_device, dtype=torch.bool)
-
-        input_lengths = [input_features.shape[-1] // i for i in [2, 2, 1]]
-
-        # pad input
-        for i in range(len(input_lengths)):
-            input_features[i, input_lengths[i] :] = 0.0
-            attention_mask[i, input_lengths[i] :] = 0.0
-
-        batch_outputs = model(input_features, attention_mask=attention_mask).last_hidden_state
-
-        for i in range(input_features.shape[0]):
-            input_slice = input_features[i : i + 1, : input_lengths[i]]
-            output = model(input_slice).last_hidden_state
-
-            batch_output = batch_outputs[i : i + 1, : output.shape[1]]
-            self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3))
-
-    def check_ctc_loss(self, config, input_features, *args):
-        model = MCTCTForCTC(config=config)
-        model.to(torch_device)
-
-        # make sure that dropout is disabled
-        model.eval()
-
-        input_features = input_features[:3]
-
-        # input_features is a 2D window for each sequence
-        attention_mask = torch.ones(input_features.shape[:-1], device=torch_device, dtype=torch.long)
-
-        # -2 since input_features is a 2D window for each sequence in batch
-        input_lengths = [input_features.shape[-2] // i for i in [2, 2, 1]]
-        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
-        labels = ids_tensor((input_features.shape[0], min(max_length_labels) - 1), model.config.vocab_size)
-        # pad input
-        for i in range(len(input_lengths)):
-            input_features[i, input_lengths[i] :] = 0.0
-            attention_mask[i, input_lengths[i] :] = 0
-
-        model.config.ctc_loss_reduction = "sum"
-        sum_loss = model(input_features, attention_mask=attention_mask, labels=labels).loss.item()
-
-        model.config.ctc_loss_reduction = "mean"
-        mean_loss = model(input_features, attention_mask=attention_mask, labels=labels).loss.item()
-
-        self.parent.assertTrue(isinstance(sum_loss, float))
-        self.parent.assertTrue(isinstance(mean_loss, float))
-
-    def check_ctc_training(self, config, input_features, *args):
-        config.ctc_zero_infinity = True
-        model = MCTCTForCTC(config=config)
-        model.to(torch_device)
-        model.train()
-
-        input_features = input_features[:3]
-
-        input_lengths = [input_features.shape[-2] // i for i in [2, 2, 1]]
-        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
-        labels = ids_tensor((input_features.shape[0], max(max_length_labels) - 1), model.config.vocab_size)
-
-        # pad input
-        for i in range(len(input_lengths)):
-            input_features[i, input_lengths[i] :] = 0.0
-
-            if max_length_labels[i] < labels.shape[-1]:
-                # it's important that we make sure that target lenghts are at least
-                # one shorter than logit lenghts to prevent -inf
-                labels[i, max_length_labels[i] - 1 :] = -100
-
-        loss = model(input_features, labels=labels).loss
-        self.parent.assertFalse(torch.isinf(loss).item())
-
-        loss.backward()
-
-    def check_labels_out_of_vocab(self, config, input_features, *args):
-        model = MCTCTForCTC(config)
-        model.to(torch_device)
-        model.train()
-
-        input_features = input_features[:3]
-
-        input_lengths = [input_features.shape[-1] // i for i in [4, 2, 1]]
-        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
-        labels = ids_tensor((input_features.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100)
-
-        with self.parent.assertRaises(ValueError):
-            model(input_features, labels=labels)
-
-    def prepare_config_and_inputs_for_common(self):
-        config, input_features, attention_mask = self.prepare_config_and_inputs()
-        inputs_dict = {"input_features": input_features, "attention_mask": attention_mask}
-        return config, inputs_dict
-
-
-@require_torch
-class MCTCTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (MCTCTForCTC, MCTCTModel) if is_torch_available() else ()
-    pipeline_model_mapping = (
-        {"automatic-speech-recognition": MCTCTForCTC, "feature-extraction": MCTCTModel} if is_torch_available() else {}
-    )
-    test_pruning = False
-    test_headmasking = False
-    test_torchscript = False
-
-    def setUp(self):
-        self.model_tester = MCTCTModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=MCTCTConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_ctc_loss_inference(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_ctc_loss(*config_and_inputs)
-
-    def test_ctc_train(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_ctc_training(*config_and_inputs)
-
-    def test_labels_out_of_vocab(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_labels_out_of_vocab(*config_and_inputs)
-
-    # MCTCT has no inputs_embeds
-    def test_inputs_embeds(self):
-        pass
-
-    # `input_ids` is renamed to `input_features`
-    def test_forward_signature(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = [
-                "input_features",
-                "attention_mask",
-                "head_mask",
-                "output_attentions",
-                "output_hidden_states",
-                "return_dict",
-            ]
-            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
-
-    # MCTCT cannot resize token embeddings
-    # since it has no tokens embeddings
-    def test_resize_tokens_embeddings(self):
-        pass
-
-    # MCTCT has no inputs_embeds
-    def test_model_common_attributes(self):
-        pass
-
-    def test_retain_grad_hidden_states_attentions(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = True
-        config.output_attentions = True
-        config.layerdrop = 0.0
-
-        # no need to test all models as different heads yield the same functionality
-        model_class = self.all_model_classes[0]
-        model = model_class(config)
-        model.to(torch_device)
-
-        input_features = inputs_dict["input_features"]
-
-        input_lengths = torch.tensor(
-            [input_features.shape[1] for _ in range(input_features.shape[0])], dtype=torch.long, device=torch_device
-        )
-        output_lengths = model._get_feat_extract_output_lengths(input_lengths)
-
-        labels = ids_tensor((input_features.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size)
-        inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"])
-        inputs_dict["labels"] = labels
-
-        outputs = model(**inputs_dict)
-
-        output = outputs[0]
-
-        # Encoder-/Decoder-only models
-        hidden_states = outputs.hidden_states[0]
-        attentions = outputs.attentions[0]
-
-        hidden_states.retain_grad()
-        attentions.retain_grad()
-
-        output.flatten()[0].backward(retain_graph=True)
-
-        self.assertIsNotNone(hidden_states.grad)
-        self.assertIsNotNone(attentions.grad)
-
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                    "objective.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
-    # overwrite from test_modeling_common
-    def _mock_init_weights(self, module):
-        if hasattr(module, "weight") and module.weight is not None:
-            module.weight.data.fill_(3)
-        if hasattr(module, "weight_g") and module.weight_g is not None:
-            module.weight_g.data.fill_(3)
-        if hasattr(module, "weight_v") and module.weight_v is not None:
-            module.weight_v.data.fill_(3)
-        if hasattr(module, "bias") and module.bias is not None:
-            module.bias.data.fill_(3)
-        if hasattr(module, "codevectors") and module.codevectors is not None:
-            module.codevectors.data.fill_(3)
-        if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None:
-            module.masked_spec_embed.data.fill_(3)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = MCTCTModel.from_pretrained("speechbrain/m-ctc-t-large")
-        self.assertIsNotNone(model)
-
-
-@require_torch
-class MCTCTRobustModelTest(ModelTesterMixin, unittest.TestCase):
-    all_model_classes = (MCTCTForCTC, MCTCTModel) if is_torch_available() else ()
-    test_pruning = False
-    test_headmasking = False
-    test_torchscript = False
-
-    def setUp(self):
-        self.model_tester = MCTCTModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=MCTCTConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_batched_inference(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_batch_inference(*config_and_inputs)
-
-    def test_ctc_loss_inference(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_ctc_loss(*config_and_inputs)
-
-    def test_ctc_train(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_ctc_training(*config_and_inputs)
-
-    def test_labels_out_of_vocab(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_labels_out_of_vocab(*config_and_inputs)
-
-    # MCTCT has no inputs_embeds
-    def test_inputs_embeds(self):
-        pass
-
-    # `input_ids` is renamed to `input_features`
-    def test_forward_signature(self):
-        pass
-
-    # MCTCT cannot resize token embeddings
-    # since it has no tokens embeddings
-    def test_resize_tokens_embeddings(self):
-        pass
-
-    # MCTCT has no inputs_embeds
-    # and thus the `get_input_embeddings` fn
-    # is not implemented
-    def test_model_common_attributes(self):
-        pass
-
-    def test_retain_grad_hidden_states_attentions(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = True
-        config.output_attentions = True
-
-        # no need to test all models as different heads yield the same functionality
-        model_class = self.all_model_classes[0]
-        model = model_class(config)
-        model.to(torch_device)
-
-        # set layer drop to 0
-        model.config.layerdrop = 0.0
-
-        input_features = inputs_dict["input_features"]
-
-        input_lengths = torch.tensor(
-            [input_features.shape[1] for _ in range(input_features.shape[0])], dtype=torch.long, device=torch_device
-        )
-        output_lengths = model._get_feat_extract_output_lengths(input_lengths)
-
-        labels = ids_tensor((input_features.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size)
-        inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"])
-        inputs_dict["labels"] = labels
-
-        outputs = model(**inputs_dict)
-
-        output = outputs[0]
-
-        # Encoder-/Decoder-only models
-        hidden_states = outputs.hidden_states[0]
-        attentions = outputs.attentions[0]
-
-        hidden_states.retain_grad()
-        attentions.retain_grad()
-
-        output.flatten()[0].backward(retain_graph=True)
-
-        self.assertIsNotNone(hidden_states.grad)
-        self.assertIsNotNone(attentions.grad)
-
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                uniform_init_parms = [
-                    "conv.weight",
-                    "masked_spec_embed",
-                    "codevectors",
-                    "quantizer.weight_proj.weight",
-                    "project_hid.weight",
-                    "project_hid.bias",
-                    "project_q.weight",
-                    "project_q.bias",
-                    "feature_projection.projection.weight",
-                    "feature_projection.projection.bias",
-                    "objective.weight",
-                ]
-                if param.requires_grad:
-                    if any(x in name for x in uniform_init_parms):
-                        self.assertTrue(
-                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-                    else:
-                        self.assertIn(
-                            ((param.data.mean() * 1e9).round() / 1e9).item(),
-                            [0.0, 1.0],
-                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                        )
-
-    # overwrite from test_modeling_common
-    def _mock_init_weights(self, module):
-        if hasattr(module, "weight") and module.weight is not None:
-            module.weight.data.fill_(3)
-        if hasattr(module, "weight_g") and module.weight_g is not None:
-            module.weight_g.data.fill_(3)
-        if hasattr(module, "weight_v") and module.weight_v is not None:
-            module.weight_v.data.fill_(3)
-        if hasattr(module, "bias") and module.bias is not None:
-            module.bias.data.fill_(3)
-        if hasattr(module, "codevectors") and module.codevectors is not None:
-            module.codevectors.data.fill_(3)
-        if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None:
-            module.masked_spec_embed.data.fill_(3)
-
-    @unittest.skip(reason="Feed forward chunking is not implemented")
-    def test_feed_forward_chunking(self):
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = MCTCTModel.from_pretrained("speechbrain/m-ctc-t-large")
-        self.assertIsNotNone(model)
-
-
-@require_torch
-@require_soundfile
-@slow
-class MCTCTModelIntegrationTest(unittest.TestCase):
-    def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        # automatic decoding with librispeech
-        speech_samples = ds.sort("id").filter(
-            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
-        )[:num_samples]["audio"]
-
-        return [x["array"] for x in speech_samples]
-
-    def test_inference_ctc_normal(self):
-        model = MCTCTForCTC.from_pretrained("speechbrain/m-ctc-t-large")
-        model.to(torch_device)
-        processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large", do_lower_case=True)
-        input_speech = self._load_datasamples(1)
-
-        input_features = processor(input_speech, return_tensors="pt").input_features.to(torch_device)
-
-        with torch.no_grad():
-            logits = model(input_features).logits
-
-        predicted_ids = torch.argmax(logits, dim=-1)
-        predicted_trans = processor.batch_decode(predicted_ids)
-
-        EXPECTED_TRANSCRIPTIONS = ["a man said to the universe, sir, i exist."]
-        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
-
-    def test_inference_ctc_normal_batched(self):
-        model = MCTCTForCTC.from_pretrained("speechbrain/m-ctc-t-large")
-        model.to(torch_device)
-        processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large", do_lower_case=True)
-
-        input_speech = self._load_datasamples(2)
-
-        inputs = processor(input_speech, return_tensors="pt", padding=True)
-
-        input_features = inputs.input_features.to(torch_device)
-        attention_mask = inputs.attention_mask.to(torch_device)
-
-        with torch.no_grad():
-            logits = model(input_features, attention_mask=attention_mask).logits
-
-        predicted_ids = torch.argmax(logits, dim=-1)
-        predicted_trans = processor.batch_decode(predicted_ids)
-
-        EXPECTED_TRANSCRIPTIONS = [
-            "a man said to the universe, sir, i exist.",
-            '"sweat-covered brion\'s body, trickling into the tight-lowing clossa was the only germent huor."',
-        ]
-        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
-
-    def test_inference_ctc_robust_batched(self):
-        model = MCTCTForCTC.from_pretrained("speechbrain/m-ctc-t-large").to(torch_device)
-        processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large", do_lower_case=True)
-
-        input_speech = self._load_datasamples(4)
-
-        inputs = processor(input_speech, return_tensors="pt", padding=True, return_attention_mask=True)
-
-        input_features = inputs.input_features.to(torch_device)
-        attention_mask = inputs.attention_mask.to(torch_device)
-
-        with torch.no_grad():
-            logits = model(input_features, attention_mask=attention_mask).logits
-
-        predicted_ids = torch.argmax(logits, dim=-1)
-        predicted_trans = processor.batch_decode(predicted_ids)
-
-        EXPECTED_TRANSCRIPTIONS = [
-            "a man said to the universe, sir, i exist.",
-            '"sweat-covered brion\'s body, trickling into the tight-lowing clossa was the only germent huor." "',
-            "\"the cadona's chest still-dripping bloodthe acofis overstrained eyes, even the soring arena around him"
-            " with thousands of spectators retrivialities not worth-thinking about.",
-            "his instant panic was followed by a small sharp blow high on his chestr.",
-        ]
-        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
diff --git a/tests/models/mctct/test_processor_mctct.py b/tests/models/mctct/test_processor_mctct.py
deleted file mode 100644
index 306d4b174f..0000000000
--- a/tests/models/mctct/test_processor_mctct.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import os
-import shutil
-import tempfile
-import unittest
-
-from transformers import MCTCTProcessor, is_speech_available, is_torch_available
-from transformers.file_utils import FEATURE_EXTRACTOR_NAME
-from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES, Wav2Vec2CTCTokenizer
-from transformers.testing_utils import require_torch, require_torchaudio
-
-
-if is_speech_available() and is_torch_available():
-    from transformers import MCTCTFeatureExtractor
-
-    from .test_feature_extraction_mctct import floats_list
-
-
-@require_torch
-@require_torchaudio
-class MCTCTProcessorTest(unittest.TestCase):
-    def setUp(self):
-        vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ")
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-
-        self.add_kwargs_tokens_map = {
-            "pad_token": "<pad>",
-            "unk_token": "<unk>",
-            "bos_token": "<s>",
-            "eos_token": "</s>",
-        }
-        feature_extractor_map = {
-            "feature_size": 1,
-            "padding_value": 0.0,
-            "sampling_rate": 16000,
-            "return_attention_mask": False,
-            "do_normalize": True,
-        }
-
-        self.tmpdirname = tempfile.mkdtemp()
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-
-        with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(feature_extractor_map) + "\n")
-
-    def get_tokenizer(self, **kwargs_init):
-        kwargs = self.add_kwargs_tokens_map.copy()
-        kwargs.update(kwargs_init)
-        return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_feature_extractor(self, **kwargs):
-        return MCTCTFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
-
-    def tearDown(self):
-        shutil.rmtree(self.tmpdirname)
-
-    def test_save_load_pretrained_default(self):
-        tokenizer = self.get_tokenizer()
-        feature_extractor = self.get_feature_extractor()
-
-        processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
-
-        processor.save_pretrained(self.tmpdirname)
-        processor = MCTCTProcessor.from_pretrained(self.tmpdirname)
-
-        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
-        self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer)
-
-        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
-        self.assertIsInstance(processor.feature_extractor, MCTCTFeatureExtractor)
-
-    def test_save_load_pretrained_additional_features(self):
-        processor = MCTCTProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
-        processor.save_pretrained(self.tmpdirname)
-
-        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
-        feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
-
-        processor = MCTCTProcessor.from_pretrained(
-            self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
-        )
-
-        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
-        self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer)
-
-        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
-        self.assertIsInstance(processor.feature_extractor, MCTCTFeatureExtractor)
-
-    def test_feature_extractor(self):
-        feature_extractor = self.get_feature_extractor()
-        tokenizer = self.get_tokenizer()
-
-        processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
-
-        raw_speech = floats_list((3, 1000))
-
-        input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
-        input_processor = processor(raw_speech, return_tensors="np")
-
-        for key in input_feat_extract.keys():
-            self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
-
-    def test_tokenizer(self):
-        feature_extractor = self.get_feature_extractor()
-        tokenizer = self.get_tokenizer()
-
-        processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
-
-        input_str = "This is a test string"
-
-        encoded_processor = processor(text=input_str)
-
-        encoded_tok = tokenizer(input_str)
-
-        for key in encoded_tok.keys():
-            self.assertListEqual(encoded_tok[key], encoded_processor[key])
-
-    def test_tokenizer_decode(self):
-        feature_extractor = self.get_feature_extractor()
-        tokenizer = self.get_tokenizer()
-
-        processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
-
-        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
-
-        decoded_processor = processor.batch_decode(predicted_ids)
-        decoded_tok = tokenizer.batch_decode(predicted_ids)
-
-        self.assertListEqual(decoded_tok, decoded_processor)
-
-    def test_model_input_names(self):
-        feature_extractor = self.get_feature_extractor()
-        tokenizer = self.get_tokenizer()
-
-        processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
-
-        self.assertListEqual(
-            processor.model_input_names,
-            feature_extractor.model_input_names,
-            msg="`processor` and `feature_extractor` model input names do not match",
-        )
diff --git a/tests/models/retribert/__init__.py b/tests/models/retribert/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/tests/models/retribert/test_tokenization_retribert.py b/tests/models/retribert/test_tokenization_retribert.py
deleted file mode 100644
index 25b3df6f3e..0000000000
--- a/tests/models/retribert/test_tokenization_retribert.py
+++ /dev/null
@@ -1,381 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Testing suite for the RetriBERT tokenizer. """
-
-
-import os
-import unittest
-
-from transformers import RetriBertTokenizer, RetriBertTokenizerFast
-from transformers.models.bert.tokenization_bert import (
-    VOCAB_FILES_NAMES,
-    BasicTokenizer,
-    WordpieceTokenizer,
-    _is_control,
-    _is_punctuation,
-    _is_whitespace,
-)
-from transformers.testing_utils import require_tokenizers, require_torch, slow
-
-from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english, merge_model_tokenizer_mappings
-
-
-# Copied from transformers.tests.bert.test_modeling_bert.py with Bert->RetriBert
-@require_tokenizers
-class RetriBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    tokenizer_class = RetriBertTokenizer
-    test_slow_tokenizer = True
-    rust_tokenizer_class = RetriBertTokenizerFast
-    test_rust_tokenizer = True
-    space_between_special_tokens = True
-    from_pretrained_filter = filter_non_english
-
-    def setUp(self):
-        super().setUp()
-
-        vocab_tokens = [
-            "[UNK]",
-            "[CLS]",
-            "[SEP]",
-            "[PAD]",
-            "[MASK]",
-            "want",
-            "##want",
-            "##ed",
-            "wa",
-            "un",
-            "runn",
-            "##ing",
-            ",",
-            "low",
-            "lowest",
-        ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "UNwant\u00E9d,running"
-        output_text = "unwanted, running"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file)
-
-        tokens = tokenizer.tokenize("UNwant\u00E9d,running")
-        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            return
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        sequence = "UNwant\u00E9d,running"
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-        # With lower casing
-        tokenizer = self.get_tokenizer(do_lower_case=True)
-        rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True)
-
-        sequence = "UNwant\u00E9d,running"
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-    def test_chinese(self):
-        tokenizer = BasicTokenizer()
-
-        self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"])
-
-    def test_basic_tokenizer_lower(self):
-        tokenizer = BasicTokenizer(do_lower_case=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["hello", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
-
-    def test_basic_tokenizer_lower_strip_accents_false(self):
-        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hällo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"])
-
-    def test_basic_tokenizer_lower_strip_accents_true(self):
-        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
-
-    def test_basic_tokenizer_lower_strip_accents_default(self):
-        tokenizer = BasicTokenizer(do_lower_case=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
-
-    def test_basic_tokenizer_no_lower(self):
-        tokenizer = BasicTokenizer(do_lower_case=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_no_lower_strip_accents_false(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_no_lower_strip_accents_true(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_respects_never_split_tokens(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
-        )
-
-    def test_wordpiece_tokenizer(self):
-        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
-
-        vocab = {}
-        for i, token in enumerate(vocab_tokens):
-            vocab[token] = i
-        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
-
-        self.assertListEqual(tokenizer.tokenize(""), [])
-
-        self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])
-
-        self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
-
-    def test_is_whitespace(self):
-        self.assertTrue(_is_whitespace(" "))
-        self.assertTrue(_is_whitespace("\t"))
-        self.assertTrue(_is_whitespace("\r"))
-        self.assertTrue(_is_whitespace("\n"))
-        self.assertTrue(_is_whitespace("\u00A0"))
-
-        self.assertFalse(_is_whitespace("A"))
-        self.assertFalse(_is_whitespace("-"))
-
-    def test_is_control(self):
-        self.assertTrue(_is_control("\u0005"))
-
-        self.assertFalse(_is_control("A"))
-        self.assertFalse(_is_control(" "))
-        self.assertFalse(_is_control("\t"))
-        self.assertFalse(_is_control("\r"))
-
-    def test_is_punctuation(self):
-        self.assertTrue(_is_punctuation("-"))
-        self.assertTrue(_is_punctuation("$"))
-        self.assertTrue(_is_punctuation("`"))
-        self.assertTrue(_is_punctuation("."))
-
-        self.assertFalse(_is_punctuation("A"))
-        self.assertFalse(_is_punctuation(" "))
-
-    def test_clean_text(self):
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
-        self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]])
-
-        self.assertListEqual(
-            [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]
-        )
-
-    @slow
-    def test_sequence_builders(self):
-        tokenizer = self.tokenizer_class.from_pretrained("yjernite/retribert-base-uncased")
-
-        text = tokenizer.encode("sequence builders", add_special_tokens=False)
-        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        assert encoded_sentence == [101] + text + [102]
-        assert encoded_pair == [101] + text + [102] + text_2 + [102]
-
-    def test_offsets_with_special_characters(self):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-
-                sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
-                tokens = tokenizer_r.encode_plus(
-                    sentence,
-                    return_attention_mask=False,
-                    return_token_type_ids=False,
-                    return_offsets_mapping=True,
-                    add_special_tokens=True,
-                )
-
-                do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False
-                expected_results = (
-                    [
-                        ((0, 0), tokenizer_r.cls_token),
-                        ((0, 1), "A"),
-                        ((1, 2), ","),
-                        ((3, 5), "na"),
-                        ((5, 6), "##ï"),
-                        ((6, 8), "##ve"),
-                        ((9, 15), tokenizer_r.mask_token),
-                        ((16, 21), "Allen"),
-                        ((21, 23), "##NL"),
-                        ((23, 24), "##P"),
-                        ((25, 33), "sentence"),
-                        ((33, 34), "."),
-                        ((0, 0), tokenizer_r.sep_token),
-                    ]
-                    if not do_lower_case
-                    else [
-                        ((0, 0), tokenizer_r.cls_token),
-                        ((0, 1), "a"),
-                        ((1, 2), ","),
-                        ((3, 8), "naive"),
-                        ((9, 15), tokenizer_r.mask_token),
-                        ((16, 21), "allen"),
-                        ((21, 23), "##nl"),
-                        ((23, 24), "##p"),
-                        ((25, 33), "sentence"),
-                        ((33, 34), "."),
-                        ((0, 0), tokenizer_r.sep_token),
-                    ]
-                )
-
-                self.assertEqual(
-                    [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
-                )
-                self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
-
-    def test_change_tokenize_chinese_chars(self):
-        list_of_commun_chinese_char = ["的", "人", "有"]
-        text_with_chinese_char = "".join(list_of_commun_chinese_char)
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                kwargs["tokenize_chinese_chars"] = True
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-
-                ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
-                ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
-
-                tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r)
-                tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p)
-
-                # it is expected that each Chinese character is not preceded by "##"
-                self.assertListEqual(tokens_without_spe_char_p, list_of_commun_chinese_char)
-                self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)
-
-                kwargs["tokenize_chinese_chars"] = False
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-
-                ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
-                ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
-
-                tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r)
-                tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p)
-
-                # it is expected that only the first Chinese character is not preceded by "##".
-                expected_tokens = [
-                    f"##{token}" if idx != 0 else token for idx, token in enumerate(list_of_commun_chinese_char)
-                ]
-                self.assertListEqual(tokens_without_spe_char_p, expected_tokens)
-                self.assertListEqual(tokens_without_spe_char_r, expected_tokens)
-
-    # RetriBertModel doesn't define `get_input_embeddings` and it's forward method doesn't take only the output of the tokenizer as input
-    @require_torch
-    @slow
-    def test_torch_encode_plus_sent_to_model(self):
-        import torch
-
-        from transformers import MODEL_MAPPING, TOKENIZER_MAPPING
-
-        MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING)
-
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
-                    return
-
-                config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
-                config = config_class()
-
-                if config.is_encoder_decoder or config.pad_token_id is None:
-                    return
-
-                model = model_class(config)
-
-                # The following test is different from the common's one
-                self.assertGreaterEqual(model.bert_query.get_input_embeddings().weight.shape[0], len(tokenizer))
-
-                # Build sequence
-                first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
-                sequence = " ".join(first_ten_tokens)
-                encoded_sequence = tokenizer.encode_plus(sequence, return_tensors="pt")
-
-                # Ensure that the BatchEncoding.to() method works.
-                encoded_sequence.to(model.device)
-
-                batch_encoded_sequence = tokenizer.batch_encode_plus([sequence, sequence], return_tensors="pt")
-                # This should not fail
-
-                with torch.no_grad():  # saves some time
-                    # The following lines are different from the common's ones
-                    model.embed_questions(**encoded_sequence)
-                    model.embed_questions(**batch_encoded_sequence)
diff --git a/tests/models/tapex/__init__.py b/tests/models/tapex/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/tests/models/tapex/test_tokenization_tapex.py b/tests/models/tapex/test_tokenization_tapex.py
deleted file mode 100644
index 9bc61acb7f..0000000000
--- a/tests/models/tapex/test_tokenization_tapex.py
+++ /dev/null
@@ -1,904 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import os
-import shutil
-import tempfile
-import unittest
-from typing import List
-
-import pandas as pd
-
-from transformers import AddedToken, TapexTokenizer
-from transformers.models.tapex.tokenization_tapex import VOCAB_FILES_NAMES
-from transformers.testing_utils import is_pt_tf_cross_test, require_pandas, slow
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-@require_pandas
-class TapexTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    tokenizer_class = TapexTokenizer
-    test_rust_tokenizer = False
-    from_pretrained_kwargs = {"cls_token": "<s>"}
-    test_seq2seq = False
-
-    def setUp(self):
-        super().setUp()
-
-        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        # fmt: off
-        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]  # noqa: E231
-        # fmt: on
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
-
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
-
-    def get_table(self, tokenizer, length=5):
-        toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))]
-
-        if length == 0:
-            data = {}
-        else:
-            data = {toks[0]: [toks[tok] for tok in range(1, length)]}
-
-        table = pd.DataFrame.from_dict(data)
-
-        return table
-
-    def get_table_and_query(self, tokenizer, length=5):
-        toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))]
-        table = self.get_table(tokenizer, length=length - 3)
-        query = " ".join(toks[:3])
-
-        return table, query
-
-    def get_clean_sequence(
-        self,
-        tokenizer,
-        with_prefix_space=False,
-        max_length=20,
-        min_length=5,
-        empty_table: bool = False,
-        add_special_tokens: bool = True,
-        return_table_and_query: bool = False,
-    ):
-        toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))]
-
-        if empty_table:
-            table = pd.DataFrame.from_dict({})
-            query = " ".join(toks[:min_length])
-        else:
-            data = {toks[0]: [toks[tok] for tok in range(1, min_length - 3)]}
-            table = pd.DataFrame.from_dict(data)
-            query = " ".join(toks[:3])
-
-        output_ids = tokenizer.encode(table, query, add_special_tokens=add_special_tokens)
-        output_txt = tokenizer.decode(output_ids)
-
-        if len(output_ids) < min_length:
-            raise ValueError("Update the code to generate the sequences so that they are larger")
-        if len(output_ids) > max_length:
-            raise ValueError("Update the code to generate the sequences so that they are smaller")
-
-        if return_table_and_query:
-            return output_txt, output_ids, table, query
-
-        return output_txt, output_ids
-
-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "lower newer"
-        output_text = "lower newer"
-        return input_text, output_text
-
-    def test_full_tokenizer_roberta(self):
-        tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)
-        text = "lower newer"
-        bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens + [tokenizer.unk_token]
-        input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-    def roberta_dict_integration_testing(self):
-        tokenizer = self.get_tokenizer()
-
-        self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2])
-        self.assertListEqual(
-            tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False),
-            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2],
-        )
-
-    def test_add_tokens_tokenizer(self):
-        tokenizers: List[TapexTokenizer] = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                table = self.get_table(tokenizer, length=0)
-                vocab_size = tokenizer.vocab_size
-                all_size = len(tokenizer)
-
-                self.assertNotEqual(vocab_size, 0)
-
-                # We usually have added tokens from the start in tests because our vocab fixtures are
-                # smaller than the original vocabs - let's not assert this
-                # self.assertEqual(vocab_size, all_size)
-
-                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
-                added_toks = tokenizer.add_tokens(new_toks)
-                vocab_size_2 = tokenizer.vocab_size
-                all_size_2 = len(tokenizer)
-
-                self.assertNotEqual(vocab_size_2, 0)
-                self.assertEqual(vocab_size, vocab_size_2)
-                self.assertEqual(added_toks, len(new_toks))
-                self.assertEqual(all_size_2, all_size + len(new_toks))
-
-                tokens = tokenizer.encode(table, "aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
-
-                self.assertGreaterEqual(len(tokens), 4)
-                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
-                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
-
-                new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
-                added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
-                vocab_size_3 = tokenizer.vocab_size
-                all_size_3 = len(tokenizer)
-
-                self.assertNotEqual(vocab_size_3, 0)
-                self.assertEqual(vocab_size, vocab_size_3)
-                self.assertEqual(added_toks_2, len(new_toks_2))
-                self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
-
-                tokens = tokenizer.encode(
-                    table,
-                    ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
-                    add_special_tokens=False,
-                )
-
-                self.assertGreaterEqual(len(tokens), 6)
-                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
-                self.assertGreater(tokens[0], tokens[1])
-                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
-                self.assertGreater(tokens[-2], tokens[-3])
-                self.assertEqual(tokens[0], tokenizer.eos_token_id)
-                self.assertEqual(tokens[-2], tokenizer.pad_token_id)
-
-    def test_token_type_ids(self):
-        tokenizers = self.get_tokenizers()
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                empty_table = self.get_table(tokenizer, length=0)
-                seq_0 = "Test this method."
-
-                # We want to have sequence 0 and sequence 1 are tagged
-                # respectively with 0 and 1 token_ids
-                # (regardless of whether the model use token type ids)
-                # We use this assumption in the QA pipeline among other place
-                output = tokenizer(empty_table, seq_0, return_token_type_ids=True)
-
-                # Assert that the token type IDs have the same length as the input IDs
-                self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"]))
-                self.assertIn(0, output["token_type_ids"])
-
-    def test_add_special_tokens(self):
-        tokenizers: List[TapexTokenizer] = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                input_table = self.get_table(tokenizer, length=0)
-
-                special_token = "[SPECIAL_TOKEN]"
-
-                tokenizer.add_special_tokens({"cls_token": special_token})
-                encoded_special_token = tokenizer.encode(input_table, special_token, add_special_tokens=False)
-                self.assertEqual(len(encoded_special_token), 1)
-
-                decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True)
-                self.assertTrue(special_token not in decoded)
-
-    def test_batch_encode_plus_overflowing_tokens(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            table = self.get_table(tokenizer, length=10)
-            string_sequences = ["Testing the prepare_for_model method.", "Test"]
-
-            if tokenizer.pad_token is None:
-                tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-
-            tokenizer.batch_encode_plus(
-                table, string_sequences, return_overflowing_tokens=True, truncation=True, padding=True, max_length=3
-            )
-
-    @is_pt_tf_cross_test
-    def test_batch_encode_plus_tensors(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                sequences = [
-                    "Testing batch encode plus",
-                    "Testing batch encode plus with different sequence lengths",
-                    "Testing batch encode plus with different sequence lengths correctly pads",
-                ]
-
-                table = self.get_table(tokenizer, length=0)
-
-                # A Tensor cannot be build by sequences which are not the same size
-                self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="pt")
-                self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="tf")
-
-                if tokenizer.pad_token_id is None:
-                    self.assertRaises(
-                        ValueError,
-                        tokenizer.batch_encode_plus,
-                        table,
-                        sequences,
-                        padding=True,
-                        return_tensors="pt",
-                    )
-                    self.assertRaises(
-                        ValueError,
-                        tokenizer.batch_encode_plus,
-                        table,
-                        sequences,
-                        padding="longest",
-                        return_tensors="tf",
-                    )
-                else:
-                    pytorch_tensor = tokenizer.batch_encode_plus(table, sequences, padding=True, return_tensors="pt")
-                    tensorflow_tensor = tokenizer.batch_encode_plus(
-                        table, sequences, padding="longest", return_tensors="tf"
-                    )
-                    encoded_sequences = tokenizer.batch_encode_plus(table, sequences, padding=True)
-
-                    for key in encoded_sequences.keys():
-                        pytorch_value = pytorch_tensor[key].tolist()
-                        tensorflow_value = tensorflow_tensor[key].numpy().tolist()
-                        encoded_value = encoded_sequences[key]
-
-                        self.assertEqual(pytorch_value, tensorflow_value, encoded_value)
-
-    def test_call(self):
-        # Tests that all call wrap to encode_plus and batch_encode_plus
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                sequences = [
-                    "Testing batch encode plus",
-                    "Testing batch encode plus with different sequence lengths",
-                    "Testing batch encode plus with different sequence lengths correctly pads",
-                ]
-
-                # Test not batched
-                table = self.get_table(tokenizer, length=0)
-                encoded_sequences_1 = tokenizer.encode_plus(table, sequences[0])
-                encoded_sequences_2 = tokenizer(table, sequences[0])
-                self.assertEqual(encoded_sequences_1, encoded_sequences_2)
-
-                # Test not batched pairs
-                table = self.get_table(tokenizer, length=10)
-                encoded_sequences_1 = tokenizer.encode_plus(table, sequences[1])
-                encoded_sequences_2 = tokenizer(table, sequences[1])
-                self.assertEqual(encoded_sequences_1, encoded_sequences_2)
-
-                # Test batched
-                table = self.get_table(tokenizer, length=0)
-                encoded_sequences_1 = tokenizer.batch_encode_plus(table, sequences)
-                encoded_sequences_2 = tokenizer(table, sequences)
-                self.assertEqual(encoded_sequences_1, encoded_sequences_2)
-
-    def test_internal_consistency(self):
-        tokenizers = self.get_tokenizers()
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                table = self.get_table(tokenizer, length=0)
-                input_text, output_text = self.get_input_output_texts(tokenizer)
-
-                tokens = tokenizer.tokenize(input_text)
-                ids = tokenizer.convert_tokens_to_ids(tokens)
-                ids_2 = tokenizer.encode(table, input_text, add_special_tokens=False)
-                self.assertListEqual(ids, ids_2)
-
-                tokens_2 = tokenizer.convert_ids_to_tokens(ids)
-                self.assertNotEqual(len(tokens_2), 0)
-                text_2 = tokenizer.decode(ids)
-                self.assertIsInstance(text_2, str)
-
-                self.assertEqual(text_2, output_text)
-
-    def test_save_and_load_tokenizer(self):
-        # safety check on max_len default value so we are sure the test works
-        tokenizers = self.get_tokenizers()
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                self.assertNotEqual(tokenizer.model_max_length, 42)
-
-        # Now let's start the test
-        tokenizers = self.get_tokenizers()
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                # Isolate this from the other tests because we save additional tokens/etc
-                table = self.get_table(tokenizer, length=0)
-                tmpdirname = tempfile.mkdtemp()
-
-                sample_text = " He is very happy, UNwant\u00E9d,running"
-                before_tokens = tokenizer.encode(table, sample_text, add_special_tokens=False)
-                before_vocab = tokenizer.get_vocab()
-                tokenizer.save_pretrained(tmpdirname)
-
-                after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
-                after_tokens = after_tokenizer.encode(table, sample_text, add_special_tokens=False)
-                after_vocab = after_tokenizer.get_vocab()
-                self.assertListEqual(before_tokens, after_tokens)
-                self.assertDictEqual(before_vocab, after_vocab)
-
-                shutil.rmtree(tmpdirname)
-
-    def test_number_of_added_tokens(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                table, query = self.get_table_and_query(tokenizer)
-
-                sequences = tokenizer.encode(table, query, add_special_tokens=False)
-                attached_sequences = tokenizer.encode(table, query, add_special_tokens=True)
-
-                self.assertEqual(2, len(attached_sequences) - len(sequences))
-
-    @unittest.skip("TAPEX cannot handle `prepare_for_model` without passing by `encode_plus` or `batch_encode_plus`")
-    def test_prepare_for_model(self):
-        pass
-
-    @unittest.skip("TAPEX tokenizer does not support pairs.")
-    def test_maximum_encoding_length_pair_input(self):
-        pass
-
-    @unittest.skip("TAPEX tokenizer does not support pairs.")
-    def test_maximum_encoding_length_single_input(self):
-        pass
-
-    @unittest.skip("Not implemented")
-    def test_right_and_left_truncation(self):
-        pass
-
-    def test_encode_decode_with_spaces(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                table = self.get_table(tokenizer, length=0)
-
-                new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
-                tokenizer.add_tokens(new_toks)
-                input = "[ABC][DEF][ABC][DEF]"
-                if self.space_between_special_tokens:
-                    output = "[ABC] [DEF] [ABC] [DEF]"
-                else:
-                    output = input
-                encoded = tokenizer.encode(table, input, add_special_tokens=False)
-                decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
-                self.assertIn(decoded, [output, output.lower()])
-
-    def test_tokenize_special_tokens(self):
-        """Test `tokenize` with special tokens."""
-        tokenizers = self.get_tokenizers(fast=True, do_lower_case=True)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                SPECIAL_TOKEN_1 = "[SPECIAL_TOKEN_1]"
-                SPECIAL_TOKEN_2 = "[SPECIAL_TOKEN_2]"
-
-                # TODO:
-                # Can we combine `unique_no_split_tokens` and `all_special_tokens`(and properties related to it)
-                # with one variable(property) for a better maintainability?
-
-                # `add_tokens` method stores special tokens only in `tokenizer.unique_no_split_tokens`. (in tokenization_utils.py)
-                tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True)
-                # `add_special_tokens` method stores special tokens in `tokenizer.additional_special_tokens`,
-                # which also occur in `tokenizer.all_special_tokens`. (in tokenization_utils_base.py)
-                tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_TOKEN_2]})
-
-                token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1)
-                token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2)
-
-                self.assertEqual(len(token_1), 1)
-                self.assertEqual(len(token_2), 1)
-                self.assertEqual(token_1[0], SPECIAL_TOKEN_1)
-                self.assertEqual(token_2[0], SPECIAL_TOKEN_2)
-
-    def test_special_tokens_mask(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                table = self.get_table(tokenizer, length=0)
-                sequence_0 = "Encode this."
-                # Testing single inputs
-                encoded_sequence = tokenizer.encode(table, sequence_0, add_special_tokens=False)
-                encoded_sequence_dict = tokenizer.encode_plus(
-                    table, sequence_0, add_special_tokens=True, return_special_tokens_mask=True
-                )
-                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
-                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
-                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
-
-                filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]]
-                self.assertEqual(encoded_sequence, filtered_sequence)
-
-    def test_padding_to_max_length(self):
-        """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                table = self.get_table(tokenizer)
-                sequence = "Sequence"
-                padding_size = 10
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, sequence)
-
-                padding_idx = tokenizer.pad_token_id
-
-                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
-                tokenizer.padding_side = "right"
-                encoded_sequence = tokenizer.encode(table, sequence)
-                sequence_length = len(encoded_sequence)
-                padded_sequence = tokenizer.encode(
-                    table,
-                    sequence,
-                    max_length=sequence_length + padding_size,
-                    pad_to_max_length=True,
-                )
-                padded_sequence_length = len(padded_sequence)
-                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
-                self.assertListEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)
-
-                # Check that nothing is done when a maximum length is not specified
-                encoded_sequence = tokenizer.encode(table, sequence)
-                sequence_length = len(encoded_sequence)
-
-                tokenizer.padding_side = "right"
-                padded_sequence_right = tokenizer.encode(table, sequence, pad_to_max_length=True)
-                padded_sequence_right_length = len(padded_sequence_right)
-                self.assertEqual(sequence_length, padded_sequence_right_length)
-                self.assertListEqual(encoded_sequence, padded_sequence_right)
-
-    def test_padding_to_multiple_of(self):
-        tokenizers = self.get_tokenizers()
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                table = self.get_table(tokenizer, length=0)
-                if tokenizer.pad_token is None:
-                    self.skipTest("No padding token.")
-                else:
-                    empty_tokens = tokenizer(table, padding=True, pad_to_multiple_of=8)
-                    normal_tokens = tokenizer(table, "This is a sample input", padding=True, pad_to_multiple_of=8)
-                    for key, value in empty_tokens.items():
-                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
-                    for key, value in normal_tokens.items():
-                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
-
-                    normal_tokens = tokenizer(table, "This", pad_to_multiple_of=8)
-                    for key, value in normal_tokens.items():
-                        self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
-
-                    # Should also work with truncation
-                    normal_tokens = tokenizer(table, "This", padding=True, truncation=True, pad_to_multiple_of=8)
-                    for key, value in normal_tokens.items():
-                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
-
-    def test_right_and_left_padding(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                table = self.get_table(tokenizer, length=0)
-                sequence = "Sequence"
-                padding_size = 10
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, sequence)
-
-                padding_idx = tokenizer.pad_token_id
-
-                # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
-                tokenizer.padding_side = "right"
-                encoded_sequence = tokenizer.encode(table, sequence)
-                sequence_length = len(encoded_sequence)
-                padded_sequence = tokenizer.encode(
-                    table, sequence, max_length=sequence_length + padding_size, padding="max_length"
-                )
-                padded_sequence_length = len(padded_sequence)
-                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
-                self.assertListEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)
-
-                # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
-                tokenizer.padding_side = "left"
-                encoded_sequence = tokenizer.encode(table, sequence)
-                sequence_length = len(encoded_sequence)
-                padded_sequence = tokenizer.encode(
-                    table, sequence, max_length=sequence_length + padding_size, padding="max_length"
-                )
-                padded_sequence_length = len(padded_sequence)
-                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
-                self.assertListEqual([padding_idx] * padding_size + encoded_sequence, padded_sequence)
-
-                # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
-                encoded_sequence = tokenizer.encode(table, sequence)
-                sequence_length = len(encoded_sequence)
-
-                tokenizer.padding_side = "right"
-                padded_sequence_right = tokenizer.encode(table, sequence, padding=True)
-                padded_sequence_right_length = len(padded_sequence_right)
-                self.assertEqual(sequence_length, padded_sequence_right_length)
-                self.assertListEqual(encoded_sequence, padded_sequence_right)
-
-                tokenizer.padding_side = "left"
-                padded_sequence_left = tokenizer.encode(table, sequence, padding="longest")
-                padded_sequence_left_length = len(padded_sequence_left)
-                self.assertEqual(sequence_length, padded_sequence_left_length)
-                self.assertListEqual(encoded_sequence, padded_sequence_left)
-
-                tokenizer.padding_side = "right"
-                padded_sequence_right = tokenizer.encode(table, sequence)
-                padded_sequence_right_length = len(padded_sequence_right)
-                self.assertEqual(sequence_length, padded_sequence_right_length)
-                self.assertListEqual(encoded_sequence, padded_sequence_right)
-
-                tokenizer.padding_side = "left"
-                padded_sequence_left = tokenizer.encode(table, sequence, padding=False)
-                padded_sequence_left_length = len(padded_sequence_left)
-                self.assertEqual(sequence_length, padded_sequence_left_length)
-                self.assertListEqual(encoded_sequence, padded_sequence_left)
-
-    def test_encode_plus_with_padding(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                table = self.get_table(tokenizer, length=0)
-                sequence = "Sequence"
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, sequence)
-
-                padding_size = 10
-                padding_idx = tokenizer.pad_token_id
-                token_type_padding_idx = tokenizer.pad_token_type_id
-
-                encoded_sequence = tokenizer.encode_plus(table, sequence, return_special_tokens_mask=True)
-                input_ids = encoded_sequence["input_ids"]
-                special_tokens_mask = encoded_sequence["special_tokens_mask"]
-                sequence_length = len(input_ids)
-
-                # Test 'longest' and 'no_padding' don't do anything
-                tokenizer.padding_side = "right"
-
-                not_padded_sequence = tokenizer.encode_plus(
-                    table,
-                    sequence,
-                    padding=False,
-                    return_special_tokens_mask=True,
-                )
-                not_padded_input_ids = not_padded_sequence["input_ids"]
-
-                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
-                not_padded_sequence_length = len(not_padded_input_ids)
-
-                self.assertEqual(sequence_length, not_padded_sequence_length)
-                self.assertListEqual(input_ids, not_padded_input_ids)
-                self.assertListEqual(special_tokens_mask, not_padded_special_tokens_mask)
-
-                not_padded_sequence = tokenizer.encode_plus(
-                    table,
-                    sequence,
-                    padding=False,
-                    return_special_tokens_mask=True,
-                )
-                not_padded_input_ids = not_padded_sequence["input_ids"]
-
-                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
-                not_padded_sequence_length = len(not_padded_input_ids)
-
-                self.assertEqual(sequence_length, not_padded_sequence_length)
-                self.assertListEqual(input_ids, not_padded_input_ids)
-                self.assertListEqual(special_tokens_mask, not_padded_special_tokens_mask)
-
-                # Test right padding
-                tokenizer.padding_side = "right"
-
-                right_padded_sequence = tokenizer.encode_plus(
-                    table,
-                    sequence,
-                    max_length=sequence_length + padding_size,
-                    padding="max_length",
-                    return_special_tokens_mask=True,
-                )
-                right_padded_input_ids = right_padded_sequence["input_ids"]
-
-                right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
-                right_padded_sequence_length = len(right_padded_input_ids)
-
-                self.assertEqual(sequence_length + padding_size, right_padded_sequence_length)
-                self.assertListEqual(input_ids + [padding_idx] * padding_size, right_padded_input_ids)
-                self.assertListEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask)
-
-                # Test left padding
-                tokenizer.padding_side = "left"
-                left_padded_sequence = tokenizer.encode_plus(
-                    table,
-                    sequence,
-                    max_length=sequence_length + padding_size,
-                    padding="max_length",
-                    return_special_tokens_mask=True,
-                )
-                left_padded_input_ids = left_padded_sequence["input_ids"]
-                left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
-                left_padded_sequence_length = len(left_padded_input_ids)
-
-                self.assertEqual(sequence_length + padding_size, left_padded_sequence_length)
-                self.assertListEqual([padding_idx] * padding_size + input_ids, left_padded_input_ids)
-                self.assertListEqual([1] * padding_size + special_tokens_mask, left_padded_special_tokens_mask)
-
-                if "token_type_ids" in tokenizer.model_input_names:
-                    token_type_ids = encoded_sequence["token_type_ids"]
-                    left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
-                    right_padded_token_type_ids = right_padded_sequence["token_type_ids"]
-
-                    self.assertListEqual(
-                        (token_type_ids + [[token_type_padding_idx] * 7] * padding_size, right_padded_token_type_ids)
-                    )
-                    self.assertListEqual(
-                        [[token_type_padding_idx] * 7] * padding_size + token_type_ids, left_padded_token_type_ids
-                    )
-
-                if "attention_mask" in tokenizer.model_input_names:
-                    attention_mask = encoded_sequence["attention_mask"]
-                    right_padded_attention_mask = right_padded_sequence["attention_mask"]
-                    left_padded_attention_mask = left_padded_sequence["attention_mask"]
-
-                    self.assertListEqual(attention_mask + [0] * padding_size, right_padded_attention_mask)
-                    self.assertListEqual([0] * padding_size + attention_mask, left_padded_attention_mask)
-
-    def test_batch_encode_plus_padding(self):
-        # Test that padded sequences are equivalent between batch_encode_plus and encode_plus
-
-        # Right padding tests
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                table = self.get_table(tokenizer, length=0)
-                sequences = [
-                    "Testing batch encode plus",
-                    "Testing batch encode plus with different sequence lengths",
-                    "Testing batch encode plus with different sequence lengths correctly pads",
-                ]
-
-                max_length = 100
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, sequences)
-
-                encoded_sequences = [
-                    tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length")
-                    for sequence in sequences
-                ]
-                encoded_sequences_batch = tokenizer.batch_encode_plus(
-                    table, sequences, max_length=max_length, padding="max_length"
-                )
-                self.assertListEqual(
-                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
-                )
-
-        # Left padding tests
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                tokenizer.padding_side = "left"
-                sequences = [
-                    "Testing batch encode plus",
-                    "Testing batch encode plus with different sequence lengths",
-                    "Testing batch encode plus with different sequence lengths correctly pads",
-                ]
-
-                max_length = 100
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, sequences)
-
-                encoded_sequences = [
-                    tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length")
-                    for sequence in sequences
-                ]
-                encoded_sequences_batch = tokenizer.batch_encode_plus(
-                    table, sequences, max_length=max_length, padding="max_length"
-                )
-                self.assertListEqual(
-                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
-                )
-
-    def test_batch_encode_plus_batch_sequence_length(self):
-        # Tests that all encoded values have the correct size
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                table = self.get_table(tokenizer, length=0)
-                sequences = [
-                    "Testing batch encode plus",
-                    "Testing batch encode plus with different sequence lengths",
-                    "Testing batch encode plus with different sequence lengths correctly pads",
-                ]
-
-                encoded_sequences = [tokenizer.encode_plus(table, sequence) for sequence in sequences]
-                encoded_sequences_batch = tokenizer.batch_encode_plus(table, sequences, padding=False)
-                self.assertListEqual(
-                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
-                )
-
-                maximum_length = len(
-                    max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len)
-                )
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, sequences)
-
-                encoded_sequences_padded = [
-                    tokenizer.encode_plus(table, sequence, max_length=maximum_length, padding="max_length")
-                    for sequence in sequences
-                ]
-
-                encoded_sequences_batch_padded = tokenizer.batch_encode_plus(table, sequences, padding=True)
-                self.assertListEqual(
-                    encoded_sequences_padded,
-                    self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded),
-                )
-
-                # check 'longest' is unsensitive to a max length
-                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=True)
-                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
-                    table, sequences, max_length=maximum_length + 10, padding="longest"
-                )
-                for key in encoded_sequences_batch_padded_1.keys():
-                    self.assertListEqual(
-                        encoded_sequences_batch_padded_1[key],
-                        encoded_sequences_batch_padded_2[key],
-                    )
-
-                # check 'no_padding' is unsensitive to a max length
-                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=False)
-                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
-                    table, sequences, max_length=maximum_length + 10, padding=False
-                )
-                for key in encoded_sequences_batch_padded_1.keys():
-                    self.assertListEqual(
-                        encoded_sequences_batch_padded_1[key],
-                        encoded_sequences_batch_padded_2[key],
-                    )
-
-    def test_special_tokens_mask_input_pairs(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                sequence_0 = "Encode this."
-                empty_table = self.get_table(tokenizer, length=0)
-                table = self.get_table(tokenizer, length=10)
-                encoded_sequence = tokenizer.encode(empty_table, sequence_0, add_special_tokens=False)
-                number_of_tokens = len(encoded_sequence)
-                encoded_sequence += tokenizer.encode(table, "", add_special_tokens=False)
-                encoded_sequence_dict = tokenizer.encode_plus(
-                    table,
-                    sequence_0,
-                    add_special_tokens=True,
-                    return_special_tokens_mask=True,
-                )
-                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
-                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
-                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
-
-                filtered_sequence = [
-                    (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
-                ]
-                # NOTE: as TAPEX adds a space between a table and a sequence, we need to remove it
-                # in order to have equivalent results with encoding an empty table or empty sequence
-                del filtered_sequence[number_of_tokens + 1]
-                filtered_sequence = [x for x in filtered_sequence if x is not None]
-                print("Encoded sequence:", encoded_sequence)
-                print("Filtered sequence:", filtered_sequence)
-                self.assertEqual(encoded_sequence, filtered_sequence)
-
-    @slow
-    def test_full_tokenizer(self):
-        question = "Greece held its last Summer Olympics in 2004"
-        table_dict = {
-            "header": ["Year", "City", "Country", "Nations"],
-            "rows": [
-                [1896, "Athens", "Greece", 14],
-                [1900, "Paris", "France", 24],
-                [1904, "St. Louis", "USA", 12],
-                [2004, "Athens", "Greece", 201],
-                [2008, "Beijing", "China", 204],
-                [2012, "London", "UK", 204],
-            ],
-        }
-        table = pd.DataFrame.from_dict(table_dict["rows"])
-        table.columns = table_dict["header"]
-
-        tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
-        encoding = tokenizer(table, question)
-
-        # fmt: off
-        expected_results = {'input_ids': [0, 821, 5314, 1755, 547, 63, 94, 1035, 1021, 31434, 2857, 11, 4482, 11311, 4832, 76, 1721, 343, 1721, 247, 1721, 3949, 3236, 112, 4832, 42773, 1721, 23, 27859, 1721, 821, 5314, 1755, 1721, 501, 3236, 132, 4832, 23137, 1721, 2242, 354, 1721, 6664, 2389, 1721, 706, 3236, 155, 4832, 42224, 1721, 1690, 4, 26120, 354, 1721, 201, 102, 1721, 316, 3236, 204, 4832, 4482, 1721, 23, 27859, 1721, 821, 5314, 1755, 1721, 21458, 3236, 195, 4832, 2266, 1721, 28, 40049, 1721, 1855, 1243, 1721, 28325, 3236, 231, 4832, 1125, 1721, 784, 24639, 1721, 1717, 330, 1721, 28325, 2]}
-        # fmt: on
-
-        self.assertListEqual(encoding.input_ids, expected_results["input_ids"])
-
-    def test_tokenizer_as_target(self):
-        # by default the tokenizer do_lower_case
-        tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
-        answer_text = "tapex is a good model!"
-        expected_src_tokens = [0, 90, 5776, 1178, 16, 10, 205, 1421, 328, 2]
-        answer_encoding = tokenizer(answer=answer_text)
-        self.assertListEqual(answer_encoding.input_ids, expected_src_tokens)
-
-    @slow
-    def test_tokenizer_lower_case(self):
-        cased_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base", do_lower_case=False)
-        uncased_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base", do_lower_case=True)
-        answer_text = "Beijing, London, Paris"
-        answer_text_lower = "beijing, london, paris"
-
-        self.assertNotEqual(
-            cased_tokenizer(answer=answer_text).input_ids, uncased_tokenizer(answer=answer_text).input_ids
-        )
-        self.assertEqual(
-            cased_tokenizer(answer=answer_text_lower).input_ids,
-            uncased_tokenizer(answer=answer_text).input_ids,
-        )
-        # batched encoding assert
-        self.assertNotEqual(
-            cased_tokenizer(answer=[answer_text]).input_ids, uncased_tokenizer(answer=[answer_text]).input_ids
-        )
-        self.assertEqual(
-            cased_tokenizer(answer=[answer_text_lower]).input_ids,
-            uncased_tokenizer(answer=[answer_text]).input_ids,
-        )
-        # test input encoding lowercase
-        question = "Greece held its last Summer Olympics in 2004"
-        table_dict = {
-            "header": ["Year", "City", "Country", "Nations"],
-            "rows": [
-                [1896, "Athens", "Greece", 14],
-                [1900, "Paris", "France", 24],
-                [1904, "St. Louis", "USA", 12],
-                [2004, "Athens", "Greece", 201],
-                [2008, "Beijing", "China", 204],
-                [2012, "London", "UK", 204],
-            ],
-        }
-        table = pd.DataFrame.from_dict(table_dict["rows"])
-        table.columns = table_dict["header"]
-
-        self.assertNotEqual(
-            cased_tokenizer(table=table, query=question).input_ids,
-            uncased_tokenizer(table=table, query=question).input_ids,
-        )
diff --git a/tests/models/trajectory_transformer/__init__.py b/tests/models/trajectory_transformer/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/tests/models/trajectory_transformer/test_modeling_trajectory_transformer.py b/tests/models/trajectory_transformer/test_modeling_trajectory_transformer.py
deleted file mode 100644
index cf553bdf75..0000000000
--- a/tests/models/trajectory_transformer/test_modeling_trajectory_transformer.py
+++ /dev/null
@@ -1,276 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Testing suite for the PyTorch TrajectoryTransformer model. """
-
-
-import inspect
-import unittest
-
-import numpy as np
-
-from transformers import TrajectoryTransformerConfig, is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import TrajectoryTransformerModel
-    from transformers.models.trajectory_transformer.modeling_trajectory_transformer import (
-        TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-
-
-class TrajectoryTransformerModelTester:
-    def __init__(self, parent, batch_size=13, n_embd=128, action_dim=6, observation_dim=17, is_training=True):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.n_embd = n_embd
-        self.action_dim = action_dim
-        self.observation_dim = observation_dim
-        self.is_training = is_training
-        self.seq_length = self.action_dim + self.observation_dim + 1
-
-    def prepare_config_and_inputs(self):
-        trajectories = torch.LongTensor([np.random.permutation(self.seq_length) for _ in range(self.batch_size)]).to(
-            torch_device
-        )
-        attention_mask = random_attention_mask((self.batch_size, self.seq_length)).to(torch_device)
-        targets = torch.LongTensor([np.random.permutation(self.seq_length) for _ in range(self.batch_size)]).to(
-            torch_device
-        )
-
-        config = self.get_config()
-        return config, trajectories, attention_mask, targets
-
-    def get_config(self):
-        return TrajectoryTransformerConfig(
-            batch_size=self.batch_size,
-            n_embd=self.n_embd,
-            action_dim=self.action_dim,
-            observation_dim=self.observation_dim,
-        )
-
-    def create_and_check_model(self, config, input_dict):
-        model = TrajectoryTransformerModel(config=config)
-        model.to(torch_device)
-        model.eval()
-
-        result = model(trajectories=input_dict["trajectories"], attention_mask=input_dict["attention_mask"])
-        result = model(
-            trajectories=input_dict["trajectories"],
-            output_hidden_states=True,
-            output_attentions=True,
-            use_cache=True,
-            return_dict=True,
-        )
-
-        self.parent.assertEqual(result.hidden_states[-1].shape, (self.batch_size, self.seq_length, self.n_embd))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (config, trajectories, attention_mask, targets) = config_and_inputs
-        inputs_dict = {"trajectories": trajectories, "attention_mask": attention_mask, "targets": targets}
-        return config, inputs_dict
-
-
-@require_torch
-class TrajectoryTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (TrajectoryTransformerModel,) if is_torch_available() else ()
-    pipeline_model_mapping = {"feature-extraction": TrajectoryTransformerModel} if is_torch_available() else {}
-
-    # Ignoring of a failing test from GenerationTesterMixin, as the model does not use inputs_ids
-    test_generate_without_input_ids = False
-
-    # Ignoring of a failing tests from ModelTesterMixin, as the model does not implement these features
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_attention_outputs = False
-    test_hidden_states_output = False
-    test_inputs_embeds = False
-    test_model_common_attributes = False
-    test_torchscript = False
-
-    def setUp(self):
-        self.model_tester = TrajectoryTransformerModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=TrajectoryTransformerConfig, n_embd=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_conditional_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["trajectories"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    # # Input is 'trajectories' not 'input_ids'
-    def test_model_main_input_name(self):
-        model_signature = inspect.signature(getattr(TrajectoryTransformerModel, "forward"))
-        # The main input is the name of the argument after `self`
-        observed_main_input_name = list(model_signature.parameters.keys())[1]
-        self.assertEqual(TrajectoryTransformerModel.main_input_name, observed_main_input_name)
-
-    def test_retain_grad_hidden_states_attentions(self):
-        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = True
-        config.output_attentions = self.has_attentions
-
-        model = TrajectoryTransformerModel(config)
-        model.to(torch_device)
-
-        outputs = model(
-            trajectories=input_dict["trajectories"],
-            attention_mask=input_dict["attention_mask"],
-            targets=input_dict["targets"],
-            output_hidden_states=True,
-            output_attentions=True,
-            use_cache=True,
-            return_dict=True,
-        )
-
-        output = outputs[0]
-        hidden_states = outputs.hidden_states[0]
-        hidden_states.retain_grad()
-
-        if self.has_attentions:
-            attentions = outputs.attentions[0]
-            attentions.retain_grad()
-
-        output.flatten()[0].backward(retain_graph=True)
-
-        self.assertIsNotNone(hidden_states.grad)
-
-        if self.has_attentions:
-            self.assertIsNotNone(attentions.grad)
-
-    def test_training(self):
-        if not self.model_tester.is_training:
-            return
-
-        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        model = TrajectoryTransformerModel(config)
-        model.to(torch_device)
-        model.train()
-        loss = model(
-            trajectories=input_dict["trajectories"],
-            attention_mask=input_dict["attention_mask"],
-            targets=input_dict["targets"],
-            output_hidden_states=True,
-            output_attentions=True,
-            use_cache=True,
-            return_dict=True,
-        ).loss
-        loss.backward()
-
-    def test_training_gradient_checkpointing(self):
-        if not self.model_tester.is_training:
-            return
-
-        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        model = TrajectoryTransformerModel(config)
-        model.gradient_checkpointing_enable()
-        model.to(torch_device)
-        model.train()
-        loss = model(
-            trajectories=input_dict["trajectories"],
-            attention_mask=input_dict["attention_mask"],
-            targets=input_dict["targets"],
-            output_hidden_states=True,
-            output_attentions=True,
-            use_cache=False,
-            return_dict=True,
-        ).loss
-        loss.backward()
-
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-            model = TrajectoryTransformerModel.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-
-
-@require_torch
-class TrajectoryTransformerModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_prediction(self):
-        batch_size = 1
-
-        config = TrajectoryTransformerConfig.from_pretrained("CarlCochet/trajectory-transformer-halfcheetah-medium-v2")
-        model = TrajectoryTransformerModel.from_pretrained(
-            "CarlCochet/trajectory-transformer-halfcheetah-medium-v2", config=config
-        )
-        model.to(torch_device)
-        model.eval()
-
-        seq_length = model.config.action_dim + model.config.observation_dim + 1
-
-        trajectories = torch.LongTensor(
-            [[3, 19, 20, 22, 9, 7, 23, 10, 18, 14, 13, 4, 17, 11, 5, 6, 15, 21, 2, 8, 1, 0, 12, 16]]
-        ).to(torch_device)
-        outputs = model(
-            trajectories=trajectories,
-            output_hidden_states=True,
-            output_attentions=True,
-            use_cache=True,
-            return_dict=True,
-        )
-
-        output = outputs.logits
-
-        expected_shape = torch.Size((batch_size, seq_length, model.config.vocab_size + 1))
-        expected_slice = torch.tensor(
-            [[[-0.7193, -0.2532, -0.0898], [1.9429, 2.0434, 2.3975], [-3.3651, -2.8744, -2.4532]]]
-        ).to(torch_device)
-        output_slice = output[:, :3, :3]
-
-        self.assertEqual(output.shape, expected_shape)
-        self.assertTrue(torch.allclose(output_slice, expected_slice, atol=1e-4))
diff --git a/tests/models/van/__init__.py b/tests/models/van/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/tests/models/van/test_modeling_van.py b/tests/models/van/test_modeling_van.py
deleted file mode 100644
index 1296e0a225..0000000000
--- a/tests/models/van/test_modeling_van.py
+++ /dev/null
@@ -1,278 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Testing suite for the PyTorch Van model. """
-
-
-import inspect
-import math
-import unittest
-
-from transformers import VanConfig
-from transformers.testing_utils import require_scipy, require_torch, require_vision, slow, torch_device
-from transformers.utils import cached_property, is_scipy_available, is_torch_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_scipy_available():
-    from scipy import stats
-
-if is_torch_available():
-    import torch
-    from torch import nn
-
-    from transformers import VanForImageClassification, VanModel
-    from transformers.models.van.modeling_van import VAN_PRETRAINED_MODEL_ARCHIVE_LIST
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import AutoImageProcessor
-
-
-class VanModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=2,
-        image_size=224,
-        num_channels=3,
-        hidden_sizes=[16, 32, 64, 128],
-        depths=[1, 1, 1, 1],
-        is_training=True,
-        use_labels=True,
-        num_labels=3,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.num_channels = num_channels
-        self.hidden_sizes = hidden_sizes
-        self.depths = depths
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.num_labels = num_labels
-        self.type_sequence_label_size = num_labels
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.num_labels)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return VanConfig(
-            num_channels=self.num_channels,
-            hidden_sizes=self.hidden_sizes,
-            depths=self.depths,
-            num_labels=self.num_labels,
-            is_decoder=False,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = VanModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values)
-        # expected last hidden states: B, C, H // 32, W // 32
-        self.parent.assertEqual(
-            result.last_hidden_state.shape,
-            (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32),
-        )
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        model = VanForImageClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values, labels=labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_torch
-class VanModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as Van does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (VanModel, VanForImageClassification) if is_torch_available() else ()
-    pipeline_model_mapping = (
-        {"feature-extraction": VanModel, "image-classification": VanForImageClassification}
-        if is_torch_available()
-        else {}
-    )
-
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    has_attentions = False
-
-    def setUp(self):
-        self.model_tester = VanModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=VanConfig, has_text_modality=False, hidden_size=37)
-
-    def test_config(self):
-        self.create_and_test_config_common_properties()
-        self.config_tester.create_and_test_config_to_json_string()
-        self.config_tester.create_and_test_config_to_json_file()
-        self.config_tester.create_and_test_config_from_and_save_pretrained()
-        self.config_tester.create_and_test_config_with_num_labels()
-        self.config_tester.check_config_can_be_init_without_params()
-        self.config_tester.check_config_arguments_init()
-
-    def create_and_test_config_common_properties(self):
-        return
-
-    @unittest.skip(reason="Van does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Van does not support input and output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @require_scipy
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        configs_no_init = _config_zero_init(config)
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, module in model.named_modules():
-                if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm, nn.LayerNorm)):
-                    self.assertTrue(
-                        torch.all(module.weight == 1),
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-                    self.assertTrue(
-                        torch.all(module.bias == 0),
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-                elif isinstance(module, nn.Conv2d):
-                    fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
-                    fan_out //= module.groups
-                    std = math.sqrt(2.0 / fan_out)
-                    # divide by std -> mean = 0, std = 1
-                    data = module.weight.data.cpu().flatten().numpy() / std
-                    test = stats.anderson(data)
-                    self.assertTrue(test.statistic > 0.05)
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_stages = len(self.model_tester.hidden_sizes)
-            # van has no embeddings
-            self.assertEqual(len(hidden_states), expected_num_stages)
-
-            # Van's feature maps are of shape (batch_size, num_channels, height, width)
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [self.model_tester.image_size // 4, self.model_tester.image_size // 4],
-            )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in VAN_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-            model = VanModel.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_torch
-@require_vision
-class VanModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return AutoImageProcessor.from_pretrained(VAN_PRETRAINED_MODEL_ARCHIVE_LIST[0])
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = VanForImageClassification.from_pretrained(VAN_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to(torch_device)
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
-        # forward pass
-        with torch.no_grad():
-            outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = torch.Size((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = torch.tensor([0.1029, -0.0904, -0.6365]).to(torch_device)
-
-        self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py
index 1175f07740..ac68337ea2 100644
--- a/utils/check_config_attributes.py
+++ b/utils/check_config_attributes.py
@@ -69,10 +69,6 @@ SPECIAL_CASES_TO_ALLOW = {
     "CvtConfig": ["layer_norm_eps"],
     # having default values other than `1e-5` - we can't fix them without breaking
     "PerceiverConfig": ["layer_norm_eps"],
-    # having default values other than `1e-5` - we can't fix them without breaking
-    "RetriBertConfig": ["layer_norm_eps"],
-    # having default values other than `1e-5` - we can't fix them without breaking
-    "TrajectoryTransformerConfig": ["layer_norm_eps"],
     # used internally to calculate the feature size
     "InformerConfig": ["num_static_real_features", "num_time_features"],
     # used internally to calculate the feature size
@@ -106,7 +102,6 @@ SPECIAL_CASES_TO_ALLOW.update(
         "OneFormerConfig": True,
         "PerceiverConfig": True,
         "RagConfig": True,
-        "RetriBertConfig": True,
         "SpeechT5Config": True,
         "SwinConfig": True,
         "Swin2SRConfig": True,
@@ -114,11 +109,9 @@ SPECIAL_CASES_TO_ALLOW.update(
         "SwitchTransformersConfig": True,
         "TableTransformerConfig": True,
         "TapasConfig": True,
-        "TrajectoryTransformerConfig": True,
         "TransfoXLConfig": True,
         "UniSpeechConfig": True,
         "UniSpeechSatConfig": True,
-        "VanConfig": True,
         "WavLMConfig": True,
         "WhisperConfig": True,
         # TODO: @Arthur (for `alignment_head` and `alignment_layer`)
@@ -267,6 +260,9 @@ def check_config_attributes():
     """Check the arguments in `__init__` of all configuration classes are used in  python files"""
     configs_with_unused_attributes = {}
     for _config_class in list(CONFIG_MAPPING.values()):
+        # Skip deprecated models
+        if "models.deprecated" in _config_class.__module__:
+            continue
         # Some config classes are not in `CONFIG_MAPPING` (e.g. `CLIPVisionConfig`, `Blip2VisionConfig`, etc.)
         config_classes_in_module = [
             cls
diff --git a/utils/check_config_docstrings.py b/utils/check_config_docstrings.py
index de47348a9e..b92a2e559d 100644
--- a/utils/check_config_docstrings.py
+++ b/utils/check_config_docstrings.py
@@ -74,6 +74,9 @@ def check_config_docstrings_have_checkpoints():
     configs_without_checkpoint = []
 
     for config_class in list(CONFIG_MAPPING.values()):
+        # Skip deprecated models
+        if "models.deprecated" in config_class.__module__:
+            continue
         checkpoint = get_checkpoint_from_config_class(config_class)
 
         name = config_class.__name__
diff --git a/utils/check_repo.py b/utils/check_repo.py
index c6e5e39374..b39bc8489e 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -400,6 +400,8 @@ def check_model_list():
     models_dir = os.path.join(PATH_TO_TRANSFORMERS, "models")
     _models = []
     for model in os.listdir(models_dir):
+        if model == "deprecated":
+            continue
         model_dir = os.path.join(models_dir, model)
         if os.path.isdir(model_dir) and "__init__.py" in os.listdir(model_dir):
             _models.append(model)
@@ -445,6 +447,8 @@ def get_model_modules():
     ]
     modules = []
     for model in dir(transformers.models):
+        if model == "deprecated":
+            continue
         # There are some magic dunder attributes in the dir, we ignore them
         if not model.startswith("__"):
             model_module = getattr(transformers.models, model)
@@ -767,6 +771,8 @@ def check_objects_being_equally_in_main_init():
         obj = getattr(transformers, attr)
         if hasattr(obj, "__module__"):
             module_path = obj.__module__
+            if "models.deprecated" in module_path:
+                continue
             module_name = module_path.split(".")[-1]
             module_dir = ".".join(module_path.split(".")[:-1])
             if (
diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt
index 73fcd37106..3c8d9ef4ea 100644
--- a/utils/documentation_tests.txt
+++ b/utils/documentation_tests.txt
@@ -277,9 +277,6 @@ src/transformers/models/mbart/tokenization_mbart.py
 src/transformers/models/mbart/tokenization_mbart_fast.py
 src/transformers/models/mbart50/tokenization_mbart50.py
 src/transformers/models/mbart50/tokenization_mbart50_fast.py
-src/transformers/models/mctct/configuration_mctct.py
-src/transformers/models/mctct/feature_extraction_mctct.py
-src/transformers/models/mctct/processing_mctct.py
 src/transformers/models/megatron_bert/configuration_megatron_bert.py
 src/transformers/models/mgp_str/processing_mgp_str.py
 src/transformers/models/mgp_str/tokenization_mgp_str.py
@@ -362,8 +359,6 @@ src/transformers/models/rembert/tokenization_rembert_fast.py
 src/transformers/models/resnet/configuration_resnet.py
 src/transformers/models/resnet/modeling_resnet.py
 src/transformers/models/resnet/modeling_tf_resnet.py
-src/transformers/models/retribert/tokenization_retribert.py
-src/transformers/models/retribert/tokenization_retribert_fast.py
 src/transformers/models/roberta/configuration_roberta.py
 src/transformers/models/roberta/modeling_roberta.py
 src/transformers/models/roberta/modeling_tf_roberta.py
@@ -413,12 +408,10 @@ src/transformers/models/t5/tokenization_t5.py
 src/transformers/models/t5/tokenization_t5_fast.py
 src/transformers/models/table_transformer/modeling_table_transformer.py
 src/transformers/models/tapas/tokenization_tapas.py
-src/transformers/models/tapex/tokenization_tapex.py
 src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
 src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
 src/transformers/models/timesformer/configuration_timesformer.py
 src/transformers/models/timesformer/modeling_timesformer.py
-src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py
 src/transformers/models/transfo_xl/configuration_transfo_xl.py
 src/transformers/models/transfo_xl/tokenization_transfo_xl.py
 src/transformers/models/trocr/configuration_trocr.py
@@ -431,7 +424,6 @@ src/transformers/models/unispeech/configuration_unispeech.py
 src/transformers/models/unispeech/modeling_unispeech.py
 src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
 src/transformers/models/upernet/modeling_upernet.py
-src/transformers/models/van/modeling_van.py
 src/transformers/models/videomae/feature_extraction_videomae.py
 src/transformers/models/videomae/image_processing_videomae.py
 src/transformers/models/videomae/modeling_videomae.py