From 372f50030bd422309d7b859d730842d7755d3371 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Thu, 15 Jun 2023 07:30:24 -0400
Subject: [PATCH] Split common test from core tests (#24284)

---
 tests/test_configuration_common.py      | 270 +------
 tests/test_configuration_utils.py       | 286 +++++++
 tests/test_feature_extraction_common.py | 127 +--
 tests/test_feature_extraction_utils.py  | 144 ++++
 tests/test_image_processing_common.py   | 147 +---
 tests/test_image_processing_utils.py    | 154 ++++
 tests/test_modeling_common.py           | 934 +----------------------
 tests/test_modeling_flax_common.py      | 175 +----
 tests/test_modeling_flax_utils.py       | 186 +++++
 tests/test_modeling_tf_common.py        | 575 +-------------
 tests/test_modeling_tf_utils.py         | 627 +++++++++++++++
 tests/test_modeling_utils.py            | 976 ++++++++++++++++++++++++
 tests/test_tokenization_common.py       | 258 +------
 tests/test_tokenization_utils.py        | 280 +++++++
 14 files changed, 2662 insertions(+), 2477 deletions(-)
 create mode 100644 tests/test_configuration_utils.py
 create mode 100644 tests/test_feature_extraction_utils.py
 create mode 100644 tests/test_image_processing_utils.py
 create mode 100644 tests/test_modeling_flax_utils.py
 create mode 100644 tests/test_modeling_tf_utils.py
 create mode 100755 tests/test_modeling_utils.py
 create mode 100644 tests/test_tokenization_utils.py

diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py
index fdb679529d..18d9a76e97 100644
--- a/tests/test_configuration_common.py
+++ b/tests/test_configuration_common.py
@@ -16,80 +16,11 @@
 import copy
 import json
 import os
-import shutil
-import sys
 import tempfile
-import unittest
-import unittest.mock as mock
-from pathlib import Path
 
-from huggingface_hub import HfFolder, delete_repo
-from requests.exceptions import HTTPError
+from transformers import is_torch_available
 
-from transformers import AutoConfig, BertConfig, GPT2Config, is_torch_available
-from transformers.configuration_utils import PretrainedConfig
-from transformers.testing_utils import TOKEN, USER, is_staging_test
-
-
-sys.path.append(str(Path(__file__).parent.parent / "utils"))
-
-from test_module.custom_configuration import CustomConfig  # noqa E402
-
-
-config_common_kwargs = {
-    "return_dict": False,
-    "output_hidden_states": True,
-    "output_attentions": True,
-    "torchscript": True,
-    "torch_dtype": "float16",
-    "use_bfloat16": True,
-    "tf_legacy_loss": True,
-    "pruned_heads": {"a": 1},
-    "tie_word_embeddings": False,
-    "is_decoder": True,
-    "cross_attention_hidden_size": 128,
-    "add_cross_attention": True,
-    "tie_encoder_decoder": True,
-    "max_length": 50,
-    "min_length": 3,
-    "do_sample": True,
-    "early_stopping": True,
-    "num_beams": 3,
-    "num_beam_groups": 3,
-    "diversity_penalty": 0.5,
-    "temperature": 2.0,
-    "top_k": 10,
-    "top_p": 0.7,
-    "typical_p": 0.2,
-    "repetition_penalty": 0.8,
-    "length_penalty": 0.8,
-    "no_repeat_ngram_size": 5,
-    "encoder_no_repeat_ngram_size": 5,
-    "bad_words_ids": [1, 2, 3],
-    "num_return_sequences": 3,
-    "chunk_size_feed_forward": 5,
-    "output_scores": True,
-    "return_dict_in_generate": True,
-    "forced_bos_token_id": 2,
-    "forced_eos_token_id": 3,
-    "remove_invalid_values": True,
-    "architectures": ["BertModel"],
-    "finetuning_task": "translation",
-    "id2label": {0: "label"},
-    "label2id": {"label": "0"},
-    "tokenizer_class": "BertTokenizerFast",
-    "prefix": "prefix",
-    "bos_token_id": 6,
-    "pad_token_id": 7,
-    "eos_token_id": 8,
-    "sep_token_id": 9,
-    "decoder_start_token_id": 10,
-    "exponential_decay_length_penalty": (5, 1.01),
-    "suppress_tokens": [0, 1],
-    "begin_suppress_tokens": 2,
-    "task_specific_params": {"translation": "some_params"},
-    "problem_type": "regression",
-}
+from .test_configuration_utils import config_common_kwargs
 
 
 class ConfigTester(object):
@@ -220,200 +151,3 @@ class ConfigTester(object):
         self.create_and_test_config_with_num_labels()
         self.check_config_can_be_init_without_params()
         self.check_config_arguments_init()
-
-
-@is_staging_test
-class ConfigPushToHubTester(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls._token = TOKEN
-        HfFolder.save_token(TOKEN)
-
-    @classmethod
-    def tearDownClass(cls):
-        try:
-            delete_repo(token=cls._token, repo_id="test-config")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, repo_id="valid_org/test-config-org")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, repo_id="test-dynamic-config")
-        except HTTPError:
-            pass
-
-    def test_push_to_hub(self):
-        config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-        )
-        config.push_to_hub("test-config", use_auth_token=self._token)
-
-        new_config = BertConfig.from_pretrained(f"{USER}/test-config")
-        for k, v in config.to_dict().items():
-            if k != "transformers_version":
-                self.assertEqual(v, getattr(new_config, k))
-
-        # Reset repo
-        delete_repo(token=self._token, repo_id="test-config")
-
-        # Push to hub via save_pretrained
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            config.save_pretrained(tmp_dir, repo_id="test-config", push_to_hub=True, use_auth_token=self._token)
-
-        new_config = BertConfig.from_pretrained(f"{USER}/test-config")
-        for k, v in config.to_dict().items():
-            if k != "transformers_version":
-                self.assertEqual(v, getattr(new_config, k))
-
-    def test_push_to_hub_in_organization(self):
-        config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-        )
-        config.push_to_hub("valid_org/test-config-org", use_auth_token=self._token)
-
-        new_config = BertConfig.from_pretrained("valid_org/test-config-org")
-        for k, v in config.to_dict().items():
-            if k != "transformers_version":
-                self.assertEqual(v, getattr(new_config, k))
-
-        # Reset repo
-        delete_repo(token=self._token, repo_id="valid_org/test-config-org")
-
-        # Push to hub via save_pretrained
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            config.save_pretrained(
-                tmp_dir, repo_id="valid_org/test-config-org", push_to_hub=True, use_auth_token=self._token
-            )
-
-        new_config = BertConfig.from_pretrained("valid_org/test-config-org")
-        for k, v in config.to_dict().items():
-            if k != "transformers_version":
-                self.assertEqual(v, getattr(new_config, k))
-
-    def test_push_to_hub_dynamic_config(self):
-        CustomConfig.register_for_auto_class()
-        config = CustomConfig(attribute=42)
-
-        config.push_to_hub("test-dynamic-config", use_auth_token=self._token)
-
-        # This has added the proper auto_map field to the config
-        self.assertDictEqual(config.auto_map, {"AutoConfig": "custom_configuration.CustomConfig"})
-
-        new_config = AutoConfig.from_pretrained(f"{USER}/test-dynamic-config", trust_remote_code=True)
-        # Can't make an isinstance check because the new_config is from the FakeConfig class of a dynamic module
-        self.assertEqual(new_config.__class__.__name__, "CustomConfig")
-        self.assertEqual(new_config.attribute, 42)
-
-
-class ConfigTestUtils(unittest.TestCase):
-    def test_config_from_string(self):
-        c = GPT2Config()
-
-        # attempt to modify each of int/float/bool/str config records and verify they were updated
-        n_embd = c.n_embd + 1  # int
-        resid_pdrop = c.resid_pdrop + 1.0  # float
-        scale_attn_weights = not c.scale_attn_weights  # bool
-        summary_type = c.summary_type + "foo"  # str
-        c.update_from_string(
-            f"n_embd={n_embd},resid_pdrop={resid_pdrop},scale_attn_weights={scale_attn_weights},summary_type={summary_type}"
-        )
-        self.assertEqual(n_embd, c.n_embd, "mismatch for key: n_embd")
-        self.assertEqual(resid_pdrop, c.resid_pdrop, "mismatch for key: resid_pdrop")
-        self.assertEqual(scale_attn_weights, c.scale_attn_weights, "mismatch for key: scale_attn_weights")
-        self.assertEqual(summary_type, c.summary_type, "mismatch for key: summary_type")
-
-    def test_config_common_kwargs_is_complete(self):
-        base_config = PretrainedConfig()
-        missing_keys = [key for key in base_config.__dict__ if key not in config_common_kwargs]
-        # If this part of the test fails, you have arguments to addin config_common_kwargs above.
-        self.assertListEqual(
-            missing_keys, ["is_encoder_decoder", "_name_or_path", "_commit_hash", "transformers_version"]
-        )
-        keys_with_defaults = [key for key, value in config_common_kwargs.items() if value == getattr(base_config, key)]
-        if len(keys_with_defaults) > 0:
-            raise ValueError(
-                "The following keys are set with the default values in"
-                " `test_configuration_common.config_common_kwargs` pick another value for them:"
-                f" {', '.join(keys_with_defaults)}."
-            )
-
-    def test_from_pretrained_subfolder(self):
-        with self.assertRaises(OSError):
-            # config is in subfolder, the following should not work without specifying the subfolder
-            _ = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert-subfolder")
-
-        config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert-subfolder", subfolder="bert")
-
-        self.assertIsNotNone(config)
-
-    def test_cached_files_are_used_when_internet_is_down(self):
-        # A mock response for an HTTP head request to emulate server down
-        response_mock = mock.Mock()
-        response_mock.status_code = 500
-        response_mock.headers = {}
-        response_mock.raise_for_status.side_effect = HTTPError
-        response_mock.json.return_value = {}
-
-        # Download this model to make sure it's in the cache.
-        _ = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        # Under the mock environment we get a 500 error when trying to reach the model.
-        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
-            _ = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
-            # This check we did call the fake head request
-            mock_head.assert_called()
-
-    def test_legacy_load_from_url(self):
-        # This test is for deprecated behavior and can be removed in v5
-        _ = BertConfig.from_pretrained(
-            "https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/config.json"
-        )
-
-
-class ConfigurationVersioningTest(unittest.TestCase):
-    def test_local_versioning(self):
-        configuration = AutoConfig.from_pretrained("bert-base-cased")
-        configuration.configuration_files = ["config.4.0.0.json"]
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            configuration.save_pretrained(tmp_dir)
-            configuration.hidden_size = 2
-            json.dump(configuration.to_dict(), open(os.path.join(tmp_dir, "config.4.0.0.json"), "w"))
-
-            # This should pick the new configuration file as the version of Transformers is > 4.0.0
-            new_configuration = AutoConfig.from_pretrained(tmp_dir)
-            self.assertEqual(new_configuration.hidden_size, 2)
-
-            # Will need to be adjusted if we reach v42 and this test is still here.
-            # Should pick the old configuration file as the version of Transformers is < 4.42.0
-            configuration.configuration_files = ["config.42.0.0.json"]
-            configuration.hidden_size = 768
-            configuration.save_pretrained(tmp_dir)
-            shutil.move(os.path.join(tmp_dir, "config.4.0.0.json"), os.path.join(tmp_dir, "config.42.0.0.json"))
-            new_configuration = AutoConfig.from_pretrained(tmp_dir)
-            self.assertEqual(new_configuration.hidden_size, 768)
-
-    def test_repo_versioning_before(self):
-        # This repo has two configuration files, one for v4.0.0 and above with a different hidden size.
-        repo = "hf-internal-testing/test-two-configs"
-
-        import transformers as new_transformers
-
-        new_transformers.configuration_utils.__version__ = "v4.0.0"
-        new_configuration, kwargs = new_transformers.models.auto.AutoConfig.from_pretrained(
-            repo, return_unused_kwargs=True
-        )
-        self.assertEqual(new_configuration.hidden_size, 2)
-        # This checks `_configuration_file` ia not kept in the kwargs by mistake.
-        self.assertDictEqual(kwargs, {})
-
-        # Testing an older version by monkey-patching the version in the module it's used.
-        import transformers as old_transformers
-
-        old_transformers.configuration_utils.__version__ = "v3.0.0"
-        old_configuration = old_transformers.models.auto.AutoConfig.from_pretrained(repo)
-        self.assertEqual(old_configuration.hidden_size, 768)
diff --git a/tests/test_configuration_utils.py b/tests/test_configuration_utils.py
new file mode 100644
index 0000000000..fe278cc2b1
--- /dev/null
+++ b/tests/test_configuration_utils.py
@@ -0,0 +1,286 @@
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import shutil
+import sys
+import tempfile
+import unittest
+import unittest.mock as mock
+from pathlib import Path
+
+from huggingface_hub import HfFolder, delete_repo
+from requests.exceptions import HTTPError
+
+from transformers import AutoConfig, BertConfig, GPT2Config
+from transformers.configuration_utils import PretrainedConfig
+from transformers.testing_utils import TOKEN, USER, is_staging_test
+
+
+sys.path.append(str(Path(__file__).parent.parent / "utils"))
+
+from test_module.custom_configuration import CustomConfig  # noqa E402
+
+
+config_common_kwargs = {
+    "return_dict": False,
+    "output_hidden_states": True,
+    "output_attentions": True,
+    "torchscript": True,
+    "torch_dtype": "float16",
+    "use_bfloat16": True,
+    "tf_legacy_loss": True,
+    "pruned_heads": {"a": 1},
+    "tie_word_embeddings": False,
+    "is_decoder": True,
+    "cross_attention_hidden_size": 128,
+    "add_cross_attention": True,
+    "tie_encoder_decoder": True,
+    "max_length": 50,
+    "min_length": 3,
+    "do_sample": True,
+    "early_stopping": True,
+    "num_beams": 3,
+    "num_beam_groups": 3,
+    "diversity_penalty": 0.5,
+    "temperature": 2.0,
+    "top_k": 10,
+    "top_p": 0.7,
+    "typical_p": 0.2,
+    "repetition_penalty": 0.8,
+    "length_penalty": 0.8,
+    "no_repeat_ngram_size": 5,
+    "encoder_no_repeat_ngram_size": 5,
+    "bad_words_ids": [1, 2, 3],
+    "num_return_sequences": 3,
+    "chunk_size_feed_forward": 5,
+    "output_scores": True,
+    "return_dict_in_generate": True,
+    "forced_bos_token_id": 2,
+    "forced_eos_token_id": 3,
+    "remove_invalid_values": True,
+    "architectures": ["BertModel"],
+    "finetuning_task": "translation",
+    "id2label": {0: "label"},
+    "label2id": {"label": "0"},
+    "tokenizer_class": "BertTokenizerFast",
+    "prefix": "prefix",
+    "bos_token_id": 6,
+    "pad_token_id": 7,
+    "eos_token_id": 8,
+    "sep_token_id": 9,
+    "decoder_start_token_id": 10,
+    "exponential_decay_length_penalty": (5, 1.01),
+    "suppress_tokens": [0, 1],
+    "begin_suppress_tokens": 2,
+    "task_specific_params": {"translation": "some_params"},
+    "problem_type": "regression",
+}
+
+
+@is_staging_test
+class ConfigPushToHubTester(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls._token = TOKEN
+        HfFolder.save_token(TOKEN)
+
+    @classmethod
+    def tearDownClass(cls):
+        try:
+            delete_repo(token=cls._token, repo_id="test-config")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(token=cls._token, repo_id="valid_org/test-config-org")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(token=cls._token, repo_id="test-dynamic-config")
+        except HTTPError:
+            pass
+
+    def test_push_to_hub(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        config.push_to_hub("test-config", use_auth_token=self._token)
+
+        new_config = BertConfig.from_pretrained(f"{USER}/test-config")
+        for k, v in config.to_dict().items():
+            if k != "transformers_version":
+                self.assertEqual(v, getattr(new_config, k))
+
+        # Reset repo
+        delete_repo(token=self._token, repo_id="test-config")
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            config.save_pretrained(tmp_dir, repo_id="test-config", push_to_hub=True, use_auth_token=self._token)
+
+        new_config = BertConfig.from_pretrained(f"{USER}/test-config")
+        for k, v in config.to_dict().items():
+            if k != "transformers_version":
+                self.assertEqual(v, getattr(new_config, k))
+
+    def test_push_to_hub_in_organization(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        config.push_to_hub("valid_org/test-config-org", use_auth_token=self._token)
+
+        new_config = BertConfig.from_pretrained("valid_org/test-config-org")
+        for k, v in config.to_dict().items():
+            if k != "transformers_version":
+                self.assertEqual(v, getattr(new_config, k))
+
+        # Reset repo
+        delete_repo(token=self._token, repo_id="valid_org/test-config-org")
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            config.save_pretrained(
+                tmp_dir, repo_id="valid_org/test-config-org", push_to_hub=True, use_auth_token=self._token
+            )
+
+        new_config = BertConfig.from_pretrained("valid_org/test-config-org")
+        for k, v in config.to_dict().items():
+            if k != "transformers_version":
+                self.assertEqual(v, getattr(new_config, k))
+
+    def test_push_to_hub_dynamic_config(self):
+        CustomConfig.register_for_auto_class()
+        config = CustomConfig(attribute=42)
+
+        config.push_to_hub("test-dynamic-config", use_auth_token=self._token)
+
+        # This has added the proper auto_map field to the config
+        self.assertDictEqual(config.auto_map, {"AutoConfig": "custom_configuration.CustomConfig"})
+
+        new_config = AutoConfig.from_pretrained(f"{USER}/test-dynamic-config", trust_remote_code=True)
+        # Can't make an isinstance check because the new_config is from the FakeConfig class of a dynamic module
+        self.assertEqual(new_config.__class__.__name__, "CustomConfig")
+        self.assertEqual(new_config.attribute, 42)
+
+
+class ConfigTestUtils(unittest.TestCase):
+    def test_config_from_string(self):
+        c = GPT2Config()
+
+        # attempt to modify each of int/float/bool/str config records and verify they were updated
+        n_embd = c.n_embd + 1  # int
+        resid_pdrop = c.resid_pdrop + 1.0  # float
+        scale_attn_weights = not c.scale_attn_weights  # bool
+        summary_type = c.summary_type + "foo"  # str
+        c.update_from_string(
+            f"n_embd={n_embd},resid_pdrop={resid_pdrop},scale_attn_weights={scale_attn_weights},summary_type={summary_type}"
+        )
+        self.assertEqual(n_embd, c.n_embd, "mismatch for key: n_embd")
+        self.assertEqual(resid_pdrop, c.resid_pdrop, "mismatch for key: resid_pdrop")
+        self.assertEqual(scale_attn_weights, c.scale_attn_weights, "mismatch for key: scale_attn_weights")
+        self.assertEqual(summary_type, c.summary_type, "mismatch for key: summary_type")
+
+    def test_config_common_kwargs_is_complete(self):
+        base_config = PretrainedConfig()
+        missing_keys = [key for key in base_config.__dict__ if key not in config_common_kwargs]
+        # If this part of the test fails, you have arguments to addin config_common_kwargs above.
+        self.assertListEqual(
+            missing_keys, ["is_encoder_decoder", "_name_or_path", "_commit_hash", "transformers_version"]
+        )
+        keys_with_defaults = [key for key, value in config_common_kwargs.items() if value == getattr(base_config, key)]
+        if len(keys_with_defaults) > 0:
+            raise ValueError(
+                "The following keys are set with the default values in"
+                " `test_configuration_common.config_common_kwargs` pick another value for them:"
+                f" {', '.join(keys_with_defaults)}."
+            )
+
+    def test_from_pretrained_subfolder(self):
+        with self.assertRaises(OSError):
+            # config is in subfolder, the following should not work without specifying the subfolder
+            _ = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert-subfolder")
+
+        config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert-subfolder", subfolder="bert")
+
+        self.assertIsNotNone(config)
+
+    def test_cached_files_are_used_when_internet_is_down(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = {}
+        response_mock.raise_for_status.side_effect = HTTPError
+        response_mock.json.return_value = {}
+
+        # Download this model to make sure it's in the cache.
+        _ = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        # Under the mock environment we get a 500 error when trying to reach the model.
+        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
+            _ = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
+            # This check we did call the fake head request
+            mock_head.assert_called()
+
+    def test_legacy_load_from_url(self):
+        # This test is for deprecated behavior and can be removed in v5
+        _ = BertConfig.from_pretrained(
+            "https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/config.json"
+        )
+
+    def test_local_versioning(self):
+        configuration = AutoConfig.from_pretrained("bert-base-cased")
+        configuration.configuration_files = ["config.4.0.0.json"]
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            configuration.save_pretrained(tmp_dir)
+            configuration.hidden_size = 2
+            json.dump(configuration.to_dict(), open(os.path.join(tmp_dir, "config.4.0.0.json"), "w"))
+
+            # This should pick the new configuration file as the version of Transformers is > 4.0.0
+            new_configuration = AutoConfig.from_pretrained(tmp_dir)
+            self.assertEqual(new_configuration.hidden_size, 2)
+
+            # Will need to be adjusted if we reach v42 and this test is still here.
+            # Should pick the old configuration file as the version of Transformers is < 4.42.0
+            configuration.configuration_files = ["config.42.0.0.json"]
+            configuration.hidden_size = 768
+            configuration.save_pretrained(tmp_dir)
+            shutil.move(os.path.join(tmp_dir, "config.4.0.0.json"), os.path.join(tmp_dir, "config.42.0.0.json"))
+            new_configuration = AutoConfig.from_pretrained(tmp_dir)
+            self.assertEqual(new_configuration.hidden_size, 768)
+
+    def test_repo_versioning_before(self):
+        # This repo has two configuration files, one for v4.0.0 and above with a different hidden size.
+        repo = "hf-internal-testing/test-two-configs"
+
+        import transformers as new_transformers
+
+        new_transformers.configuration_utils.__version__ = "v4.0.0"
+        new_configuration, kwargs = new_transformers.models.auto.AutoConfig.from_pretrained(
+            repo, return_unused_kwargs=True
+        )
+        self.assertEqual(new_configuration.hidden_size, 2)
+        # This checks `_configuration_file` ia not kept in the kwargs by mistake.
+        self.assertDictEqual(kwargs, {})
+
+        # Testing an older version by monkey-patching the version in the module it's used.
+        import transformers as old_transformers
+
+        old_transformers.configuration_utils.__version__ = "v3.0.0"
+        old_configuration = old_transformers.models.auto.AutoConfig.from_pretrained(repo)
+        self.assertEqual(old_configuration.hidden_size, 768)
diff --git a/tests/test_feature_extraction_common.py b/tests/test_feature_extraction_common.py
index 6ffec66913..49937309d0 100644
--- a/tests/test_feature_extraction_common.py
+++ b/tests/test_feature_extraction_common.py
@@ -16,25 +16,9 @@
 
 import json
 import os
-import sys
 import tempfile
-import unittest
-import unittest.mock as mock
-from pathlib import Path
 
-from huggingface_hub import HfFolder, delete_repo
-from requests.exceptions import HTTPError
-
-from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor
-from transformers.testing_utils import TOKEN, USER, check_json_file_has_correct_format, get_tests_dir, is_staging_test
-
-
-sys.path.append(str(Path(__file__).parent.parent / "utils"))
-
-from test_module.custom_feature_extraction import CustomFeatureExtractor  # noqa E402
-
-
-SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = get_tests_dir("fixtures")
+from transformers.testing_utils import check_json_file_has_correct_format
 
 
 class FeatureExtractionSavingTestMixin:
@@ -69,112 +53,3 @@ class FeatureExtractionSavingTestMixin:
     def test_init_without_params(self):
         feat_extract = self.feature_extraction_class()
         self.assertIsNotNone(feat_extract)
-
-
-class FeatureExtractorUtilTester(unittest.TestCase):
-    def test_cached_files_are_used_when_internet_is_down(self):
-        # A mock response for an HTTP head request to emulate server down
-        response_mock = mock.Mock()
-        response_mock.status_code = 500
-        response_mock.headers = {}
-        response_mock.raise_for_status.side_effect = HTTPError
-        response_mock.json.return_value = {}
-
-        # Download this model to make sure it's in the cache.
-        _ = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2")
-        # Under the mock environment we get a 500 error when trying to reach the model.
-        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
-            _ = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2")
-            # This check we did call the fake head request
-            mock_head.assert_called()
-
-    def test_legacy_load_from_url(self):
-        # This test is for deprecated behavior and can be removed in v5
-        _ = Wav2Vec2FeatureExtractor.from_pretrained(
-            "https://huggingface.co/hf-internal-testing/tiny-random-wav2vec2/resolve/main/preprocessor_config.json"
-        )
-
-
-@is_staging_test
-class FeatureExtractorPushToHubTester(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls._token = TOKEN
-        HfFolder.save_token(TOKEN)
-
-    @classmethod
-    def tearDownClass(cls):
-        try:
-            delete_repo(token=cls._token, repo_id="test-feature-extractor")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, repo_id="valid_org/test-feature-extractor-org")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, repo_id="test-dynamic-feature-extractor")
-        except HTTPError:
-            pass
-
-    def test_push_to_hub(self):
-        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
-        feature_extractor.push_to_hub("test-feature-extractor", use_auth_token=self._token)
-
-        new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"{USER}/test-feature-extractor")
-        for k, v in feature_extractor.__dict__.items():
-            self.assertEqual(v, getattr(new_feature_extractor, k))
-
-        # Reset repo
-        delete_repo(token=self._token, repo_id="test-feature-extractor")
-
-        # Push to hub via save_pretrained
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            feature_extractor.save_pretrained(
-                tmp_dir, repo_id="test-feature-extractor", push_to_hub=True, use_auth_token=self._token
-            )
-
-        new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"{USER}/test-feature-extractor")
-        for k, v in feature_extractor.__dict__.items():
-            self.assertEqual(v, getattr(new_feature_extractor, k))
-
-    def test_push_to_hub_in_organization(self):
-        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
-        feature_extractor.push_to_hub("valid_org/test-feature-extractor", use_auth_token=self._token)
-
-        new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("valid_org/test-feature-extractor")
-        for k, v in feature_extractor.__dict__.items():
-            self.assertEqual(v, getattr(new_feature_extractor, k))
-
-        # Reset repo
-        delete_repo(token=self._token, repo_id="valid_org/test-feature-extractor")
-
-        # Push to hub via save_pretrained
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            feature_extractor.save_pretrained(
-                tmp_dir, repo_id="valid_org/test-feature-extractor-org", push_to_hub=True, use_auth_token=self._token
-            )
-
-        new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("valid_org/test-feature-extractor-org")
-        for k, v in feature_extractor.__dict__.items():
-            self.assertEqual(v, getattr(new_feature_extractor, k))
-
-    def test_push_to_hub_dynamic_feature_extractor(self):
-        CustomFeatureExtractor.register_for_auto_class()
-        feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
-
-        feature_extractor.push_to_hub("test-dynamic-feature-extractor", use_auth_token=self._token)
-
-        # This has added the proper auto_map field to the config
-        self.assertDictEqual(
-            feature_extractor.auto_map,
-            {"AutoFeatureExtractor": "custom_feature_extraction.CustomFeatureExtractor"},
-        )
-
-        new_feature_extractor = AutoFeatureExtractor.from_pretrained(
-            f"{USER}/test-dynamic-feature-extractor", trust_remote_code=True
-        )
-        # Can't make an isinstance check because the new_feature_extractor is from the CustomFeatureExtractor class of a dynamic module
-        self.assertEqual(new_feature_extractor.__class__.__name__, "CustomFeatureExtractor")
diff --git a/tests/test_feature_extraction_utils.py b/tests/test_feature_extraction_utils.py
new file mode 100644
index 0000000000..b17c48ff12
--- /dev/null
+++ b/tests/test_feature_extraction_utils.py
@@ -0,0 +1,144 @@
+# coding=utf-8
+# Copyright 2021 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import sys
+import tempfile
+import unittest
+import unittest.mock as mock
+from pathlib import Path
+
+from huggingface_hub import HfFolder, delete_repo
+from requests.exceptions import HTTPError
+
+from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor
+from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test
+
+
+sys.path.append(str(Path(__file__).parent.parent / "utils"))
+
+from test_module.custom_feature_extraction import CustomFeatureExtractor  # noqa E402
+
+
+SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = get_tests_dir("fixtures")
+
+
+class FeatureExtractorUtilTester(unittest.TestCase):
+    def test_cached_files_are_used_when_internet_is_down(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = {}
+        response_mock.raise_for_status.side_effect = HTTPError
+        response_mock.json.return_value = {}
+
+        # Download this model to make sure it's in the cache.
+        _ = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2")
+        # Under the mock environment we get a 500 error when trying to reach the model.
+        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
+            _ = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2")
+            # This check we did call the fake head request
+            mock_head.assert_called()
+
+    def test_legacy_load_from_url(self):
+        # This test is for deprecated behavior and can be removed in v5
+        _ = Wav2Vec2FeatureExtractor.from_pretrained(
+            "https://huggingface.co/hf-internal-testing/tiny-random-wav2vec2/resolve/main/preprocessor_config.json"
+        )
+
+
+@is_staging_test
+class FeatureExtractorPushToHubTester(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls._token = TOKEN
+        HfFolder.save_token(TOKEN)
+
+    @classmethod
+    def tearDownClass(cls):
+        try:
+            delete_repo(token=cls._token, repo_id="test-feature-extractor")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(token=cls._token, repo_id="valid_org/test-feature-extractor-org")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(token=cls._token, repo_id="test-dynamic-feature-extractor")
+        except HTTPError:
+            pass
+
+    def test_push_to_hub(self):
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
+        feature_extractor.push_to_hub("test-feature-extractor", use_auth_token=self._token)
+
+        new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"{USER}/test-feature-extractor")
+        for k, v in feature_extractor.__dict__.items():
+            self.assertEqual(v, getattr(new_feature_extractor, k))
+
+        # Reset repo
+        delete_repo(token=self._token, repo_id="test-feature-extractor")
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            feature_extractor.save_pretrained(
+                tmp_dir, repo_id="test-feature-extractor", push_to_hub=True, use_auth_token=self._token
+            )
+
+        new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"{USER}/test-feature-extractor")
+        for k, v in feature_extractor.__dict__.items():
+            self.assertEqual(v, getattr(new_feature_extractor, k))
+
+    def test_push_to_hub_in_organization(self):
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
+        feature_extractor.push_to_hub("valid_org/test-feature-extractor", use_auth_token=self._token)
+
+        new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("valid_org/test-feature-extractor")
+        for k, v in feature_extractor.__dict__.items():
+            self.assertEqual(v, getattr(new_feature_extractor, k))
+
+        # Reset repo
+        delete_repo(token=self._token, repo_id="valid_org/test-feature-extractor")
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            feature_extractor.save_pretrained(
+                tmp_dir, repo_id="valid_org/test-feature-extractor-org", push_to_hub=True, use_auth_token=self._token
+            )
+
+        new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("valid_org/test-feature-extractor-org")
+        for k, v in feature_extractor.__dict__.items():
+            self.assertEqual(v, getattr(new_feature_extractor, k))
+
+    def test_push_to_hub_dynamic_feature_extractor(self):
+        CustomFeatureExtractor.register_for_auto_class()
+        feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
+
+        feature_extractor.push_to_hub("test-dynamic-feature-extractor", use_auth_token=self._token)
+
+        # This has added the proper auto_map field to the config
+        self.assertDictEqual(
+            feature_extractor.auto_map,
+            {"AutoFeatureExtractor": "custom_feature_extraction.CustomFeatureExtractor"},
+        )
+
+        new_feature_extractor = AutoFeatureExtractor.from_pretrained(
+            f"{USER}/test-dynamic-feature-extractor", trust_remote_code=True
+        )
+        # Can't make an isinstance check because the new_feature_extractor is from the CustomFeatureExtractor class of a dynamic module
+        self.assertEqual(new_feature_extractor.__class__.__name__, "CustomFeatureExtractor")
diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py
index 15a6688fbb..166440c4d5 100644
--- a/tests/test_image_processing_common.py
+++ b/tests/test_image_processing_common.py
@@ -13,36 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import json
 import os
-import sys
 import tempfile
-import unittest
-import unittest.mock as mock
-from pathlib import Path
 
-from huggingface_hub import HfFolder, delete_repo
-from requests.exceptions import HTTPError
-
-from transformers import AutoImageProcessor, ViTImageProcessor
-from transformers.testing_utils import (
-    TOKEN,
-    USER,
-    check_json_file_has_correct_format,
-    get_tests_dir,
-    is_staging_test,
-    require_torch,
-    require_vision,
-)
+from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available
 
 
-sys.path.append(str(Path(__file__).parent.parent / "utils"))
-
-from test_module.custom_image_processing import CustomImageProcessor  # noqa E402
-
-
 if is_torch_available():
     import numpy as np
     import torch
@@ -51,9 +29,6 @@ if is_vision_available():
     from PIL import Image
 
 
-SAMPLE_IMAGE_PROCESSING_CONFIG_DIR = get_tests_dir("fixtures")
-
-
 def prepare_image_inputs(image_processor_tester, equal_resolution=False, numpify=False, torchify=False):
     """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
     or a list of PyTorch tensors if one specifies torchify=True.
@@ -201,123 +176,3 @@ class ImageProcessingSavingTestMixin:
             self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
             self.assertEqual(encoding.pixel_values.dtype, torch.float16)
             self.assertEqual(encoding.input_ids.dtype, torch.long)
-
-
-class ImageProcessorUtilTester(unittest.TestCase):
-    def test_cached_files_are_used_when_internet_is_down(self):
-        # A mock response for an HTTP head request to emulate server down
-        response_mock = mock.Mock()
-        response_mock.status_code = 500
-        response_mock.headers = {}
-        response_mock.raise_for_status.side_effect = HTTPError
-        response_mock.json.return_value = {}
-
-        # Download this model to make sure it's in the cache.
-        _ = ViTImageProcessor.from_pretrained("hf-internal-testing/tiny-random-vit")
-        # Under the mock environment we get a 500 error when trying to reach the model.
-        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
-            _ = ViTImageProcessor.from_pretrained("hf-internal-testing/tiny-random-vit")
-            # This check we did call the fake head request
-            mock_head.assert_called()
-
-    def test_legacy_load_from_url(self):
-        # This test is for deprecated behavior and can be removed in v5
-        _ = ViTImageProcessor.from_pretrained(
-            "https://huggingface.co/hf-internal-testing/tiny-random-vit/resolve/main/preprocessor_config.json"
-        )
-
-
-@is_staging_test
-class ImageProcessorPushToHubTester(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls._token = TOKEN
-        HfFolder.save_token(TOKEN)
-
-    @classmethod
-    def tearDownClass(cls):
-        try:
-            delete_repo(token=cls._token, repo_id="test-image-processor")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, repo_id="valid_org/test-image-processor-org")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, repo_id="test-dynamic-image-processor")
-        except HTTPError:
-            pass
-
-    def test_push_to_hub(self):
-        image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR)
-        image_processor.push_to_hub("test-image-processor", use_auth_token=self._token)
-
-        new_image_processor = ViTImageProcessor.from_pretrained(f"{USER}/test-image-processor")
-        for k, v in image_processor.__dict__.items():
-            self.assertEqual(v, getattr(new_image_processor, k))
-
-        # Reset repo
-        delete_repo(token=self._token, repo_id="test-image-processor")
-
-        # Push to hub via save_pretrained
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            image_processor.save_pretrained(
-                tmp_dir, repo_id="test-image-processor", push_to_hub=True, use_auth_token=self._token
-            )
-
-        new_image_processor = ViTImageProcessor.from_pretrained(f"{USER}/test-image-processor")
-        for k, v in image_processor.__dict__.items():
-            self.assertEqual(v, getattr(new_image_processor, k))
-
-    def test_push_to_hub_in_organization(self):
-        image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR)
-        image_processor.push_to_hub("valid_org/test-image-processor", use_auth_token=self._token)
-
-        new_image_processor = ViTImageProcessor.from_pretrained("valid_org/test-image-processor")
-        for k, v in image_processor.__dict__.items():
-            self.assertEqual(v, getattr(new_image_processor, k))
-
-        # Reset repo
-        delete_repo(token=self._token, repo_id="valid_org/test-image-processor")
-
-        # Push to hub via save_pretrained
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            image_processor.save_pretrained(
-                tmp_dir, repo_id="valid_org/test-image-processor-org", push_to_hub=True, use_auth_token=self._token
-            )
-
-        new_image_processor = ViTImageProcessor.from_pretrained("valid_org/test-image-processor-org")
-        for k, v in image_processor.__dict__.items():
-            self.assertEqual(v, getattr(new_image_processor, k))
-
-    def test_push_to_hub_dynamic_image_processor(self):
-        CustomImageProcessor.register_for_auto_class()
-        image_processor = CustomImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR)
-
-        image_processor.push_to_hub("test-dynamic-image-processor", use_auth_token=self._token)
-
-        # This has added the proper auto_map field to the config
-        self.assertDictEqual(
-            image_processor.auto_map,
-            {"ImageProcessor": "custom_image_processing.CustomImageProcessor"},
-        )
-
-        new_image_processor = AutoImageProcessor.from_pretrained(
-            f"{USER}/test-dynamic-image-processor", trust_remote_code=True
-        )
-        # Can't make an isinstance check because the new_image_processor is from the CustomImageProcessor class of a dynamic module
-        self.assertEqual(new_image_processor.__class__.__name__, "CustomImageProcessor")
-
-    def test_image_processor_from_pretrained_subfolder(self):
-        with self.assertRaises(OSError):
-            # config is in subfolder, the following should not work without specifying the subfolder
-            _ = AutoImageProcessor.from_pretrained("hf-internal-testing/stable-diffusion-all-variants")
-
-        config = AutoImageProcessor.from_pretrained(
-            "hf-internal-testing/stable-diffusion-all-variants", subfolder="feature_extractor"
-        )
-
-        self.assertIsNotNone(config)
diff --git a/tests/test_image_processing_utils.py b/tests/test_image_processing_utils.py
new file mode 100644
index 0000000000..d74da63249
--- /dev/null
+++ b/tests/test_image_processing_utils.py
@@ -0,0 +1,154 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import tempfile
+import unittest
+import unittest.mock as mock
+from pathlib import Path
+
+from huggingface_hub import HfFolder, delete_repo
+from requests.exceptions import HTTPError
+
+from transformers import AutoImageProcessor, ViTImageProcessor
+from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test
+
+
+sys.path.append(str(Path(__file__).parent.parent / "utils"))
+
+from test_module.custom_image_processing import CustomImageProcessor  # noqa E402
+
+
+SAMPLE_IMAGE_PROCESSING_CONFIG_DIR = get_tests_dir("fixtures")
+
+
+class ImageProcessorUtilTester(unittest.TestCase):
+    def test_cached_files_are_used_when_internet_is_down(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = {}
+        response_mock.raise_for_status.side_effect = HTTPError
+        response_mock.json.return_value = {}
+
+        # Download this model to make sure it's in the cache.
+        _ = ViTImageProcessor.from_pretrained("hf-internal-testing/tiny-random-vit")
+        # Under the mock environment we get a 500 error when trying to reach the model.
+        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
+            _ = ViTImageProcessor.from_pretrained("hf-internal-testing/tiny-random-vit")
+            # This check we did call the fake head request
+            mock_head.assert_called()
+
+    def test_legacy_load_from_url(self):
+        # This test is for deprecated behavior and can be removed in v5
+        _ = ViTImageProcessor.from_pretrained(
+            "https://huggingface.co/hf-internal-testing/tiny-random-vit/resolve/main/preprocessor_config.json"
+        )
+
+
+@is_staging_test
+class ImageProcessorPushToHubTester(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls._token = TOKEN
+        HfFolder.save_token(TOKEN)
+
+    @classmethod
+    def tearDownClass(cls):
+        try:
+            delete_repo(token=cls._token, repo_id="test-image-processor")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(token=cls._token, repo_id="valid_org/test-image-processor-org")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(token=cls._token, repo_id="test-dynamic-image-processor")
+        except HTTPError:
+            pass
+
+    def test_push_to_hub(self):
+        image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR)
+        image_processor.push_to_hub("test-image-processor", use_auth_token=self._token)
+
+        new_image_processor = ViTImageProcessor.from_pretrained(f"{USER}/test-image-processor")
+        for k, v in image_processor.__dict__.items():
+            self.assertEqual(v, getattr(new_image_processor, k))
+
+        # Reset repo
+        delete_repo(token=self._token, repo_id="test-image-processor")
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            image_processor.save_pretrained(
+                tmp_dir, repo_id="test-image-processor", push_to_hub=True, use_auth_token=self._token
+            )
+
+        new_image_processor = ViTImageProcessor.from_pretrained(f"{USER}/test-image-processor")
+        for k, v in image_processor.__dict__.items():
+            self.assertEqual(v, getattr(new_image_processor, k))
+
+    def test_push_to_hub_in_organization(self):
+        image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR)
+        image_processor.push_to_hub("valid_org/test-image-processor", use_auth_token=self._token)
+
+        new_image_processor = ViTImageProcessor.from_pretrained("valid_org/test-image-processor")
+        for k, v in image_processor.__dict__.items():
+            self.assertEqual(v, getattr(new_image_processor, k))
+
+        # Reset repo
+        delete_repo(token=self._token, repo_id="valid_org/test-image-processor")
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            image_processor.save_pretrained(
+                tmp_dir, repo_id="valid_org/test-image-processor-org", push_to_hub=True, use_auth_token=self._token
+            )
+
+        new_image_processor = ViTImageProcessor.from_pretrained("valid_org/test-image-processor-org")
+        for k, v in image_processor.__dict__.items():
+            self.assertEqual(v, getattr(new_image_processor, k))
+
+    def test_push_to_hub_dynamic_image_processor(self):
+        CustomImageProcessor.register_for_auto_class()
+        image_processor = CustomImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR)
+
+        image_processor.push_to_hub("test-dynamic-image-processor", use_auth_token=self._token)
+
+        # This has added the proper auto_map field to the config
+        self.assertDictEqual(
+            image_processor.auto_map,
+            {"ImageProcessor": "custom_image_processing.CustomImageProcessor"},
+        )
+
+        new_image_processor = AutoImageProcessor.from_pretrained(
+            f"{USER}/test-dynamic-image-processor", trust_remote_code=True
+        )
+        # Can't make an isinstance check because the new_image_processor is from the CustomImageProcessor class of a dynamic module
+        self.assertEqual(new_image_processor.__class__.__name__, "CustomImageProcessor")
+
+    def test_image_processor_from_pretrained_subfolder(self):
+        with self.assertRaises(OSError):
+            # config is in subfolder, the following should not work without specifying the subfolder
+            _ = AutoImageProcessor.from_pretrained("hf-internal-testing/stable-diffusion-all-variants")
+
+        config = AutoImageProcessor.from_pretrained(
+            "hf-internal-testing/stable-diffusion-all-variants", subfolder="feature_extractor"
+        )
+
+        self.assertIsNotNone(config)
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 1a55aa6b75..07a8b16bfe 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -16,32 +16,22 @@
 import collections
 import copy
 import gc
-import glob
 import inspect
-import json
 import os
 import os.path
 import pickle
 import random
 import re
-import sys
 import tempfile
-import unittest
-import unittest.mock as mock
 import warnings
 from collections import defaultdict
-from pathlib import Path
 from typing import Dict, List, Tuple
 
 import numpy as np
-from huggingface_hub import HfFolder, delete_repo
-from huggingface_hub.file_download import http_get
 from pytest import mark
-from requests.exceptions import HTTPError
 
 import transformers
 from transformers import (
-    AutoConfig,
     AutoModel,
     AutoModelForSequenceClassification,
     PretrainedConfig,
@@ -70,28 +60,20 @@ from transformers.models.auto.modeling_auto import (
     MODEL_MAPPING_NAMES,
 )
 from transformers.testing_utils import (
-    TOKEN,
-    USER,
     CaptureLogger,
-    TestCasePlus,
     is_pt_flax_cross_test,
     is_pt_tf_cross_test,
-    is_staging_test,
     require_accelerate,
     require_safetensors,
     require_torch,
     require_torch_gpu,
     require_torch_multi_gpu,
-    require_usr_bin_time,
     slow,
     torch_device,
 )
 from transformers.utils import (
     CONFIG_NAME,
     GENERATION_CONFIG_NAME,
-    SAFE_WEIGHTS_INDEX_NAME,
-    SAFE_WEIGHTS_NAME,
-    WEIGHTS_INDEX_NAME,
     WEIGHTS_NAME,
     is_accelerate_available,
     is_flax_available,
@@ -101,65 +83,17 @@ from transformers.utils import (
 from transformers.utils.generic import ModelOutput
 
 
-sys.path.append(str(Path(__file__).parent.parent / "utils"))
-
-from test_module.custom_configuration import CustomConfig, NoSuperInitConfig  # noqa E402
-
-
 if is_accelerate_available():
     from accelerate.utils import compute_module_sizes
 
 
 if is_torch_available():
     import torch
-    from test_module.custom_modeling import CustomModel, NoSuperInitModel
     from torch import nn
 
-    from transformers import (
-        BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
-        MODEL_MAPPING,
-        AdaptiveEmbedding,
-        AutoModelForCausalLM,
-        AutoTokenizer,
-        BertConfig,
-        BertModel,
-        CLIPTextModel,
-        PreTrainedModel,
-        T5Config,
-        T5ForConditionalGeneration,
-    )
-    from transformers.modeling_utils import shard_checkpoint
+    from transformers import MODEL_MAPPING, AdaptiveEmbedding
     from transformers.pytorch_utils import id_tensor_storage
 
-    # Fake pretrained models for tests
-    class BaseModel(PreTrainedModel):
-        config_class = PretrainedConfig
-
-        def __init__(self, config):
-            super().__init__(config)
-            self.linear = nn.Linear(4, 5)
-            self.linear_2 = nn.Linear(5, 6)
-
-        def forward(self, x):
-            return self.linear_2(self.linear(x))
-
-    class ModelWithHead(PreTrainedModel):
-        base_model_prefix = "base"
-        config_class = PretrainedConfig
-
-        def _init_weights(self, module):
-            pass
-
-        def __init__(self, config):
-            super().__init__(config)
-            self.base = BaseModel(config)
-            # linear is a common name between Base and Head on purpose.
-            self.linear = nn.Linear(6, 3)
-            self.linear2 = nn.Linear(3, 5)
-
-        def forward(self, x):
-            return self.linear2(self.linear(self.base(x)))
-
 
 if is_tf_available():
     import tensorflow as tf
@@ -187,10 +121,6 @@ def _config_zero_init(config):
     return configs_no_init
 
 
-TINY_T5 = "patrickvonplaten/t5-tiny-random"
-TINY_BERT_FOR_TOKEN_CLASSIFICATION = "hf-internal-testing/tiny-bert-for-token-classification"
-
-
 def _mock_init_weights(self, module):
     for name, param in module.named_parameters(recurse=False):
         # Use the first letter of the name to get a value and go from a <> -13 to z <> 12
@@ -2778,865 +2708,3 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None):
         values.append(rng.random() * scale)
 
     return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
-
-
-def check_models_equal(model1, model2):
-    models_are_equal = True
-    for model1_p, model2_p in zip(model1.parameters(), model2.parameters()):
-        if model1_p.data.ne(model2_p.data).sum() > 0:
-            models_are_equal = False
-
-    return models_are_equal
-
-
-@require_torch
-class ModelUtilsTest(TestCasePlus):
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-            config = BertConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, PretrainedConfig)
-
-            model = BertModel.from_pretrained(model_name)
-            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, PreTrainedModel)
-
-            self.assertEqual(len(loading_info["missing_keys"]), 0)
-            self.assertEqual(len(loading_info["unexpected_keys"]), 8)
-            self.assertEqual(len(loading_info["mismatched_keys"]), 0)
-            self.assertEqual(len(loading_info["error_msgs"]), 0)
-
-            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-
-            # Not sure this is the intended behavior. TODO fix Lysandre & Thom
-            config.name_or_path = model_name
-
-            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-            self.assertEqual(model.config.output_hidden_states, True)
-            self.assertEqual(model.config, config)
-
-    def test_model_from_pretrained_subfolder(self):
-        config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
-        model = BertModel(config)
-
-        subfolder = "bert"
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(os.path.join(tmp_dir, subfolder))
-
-            with self.assertRaises(OSError):
-                _ = BertModel.from_pretrained(tmp_dir)
-
-            model_loaded = BertModel.from_pretrained(tmp_dir, subfolder=subfolder)
-
-        self.assertTrue(check_models_equal(model, model_loaded))
-
-    def test_model_from_pretrained_subfolder_sharded(self):
-        config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
-        model = BertModel(config)
-
-        subfolder = "bert"
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(os.path.join(tmp_dir, subfolder), max_shard_size="10KB")
-
-            with self.assertRaises(OSError):
-                _ = BertModel.from_pretrained(tmp_dir)
-
-            model_loaded = BertModel.from_pretrained(tmp_dir, subfolder=subfolder)
-
-        self.assertTrue(check_models_equal(model, model_loaded))
-
-    def test_model_from_pretrained_hub_subfolder(self):
-        subfolder = "bert"
-        model_id = "hf-internal-testing/tiny-random-bert-subfolder"
-        with self.assertRaises(OSError):
-            _ = BertModel.from_pretrained(model_id)
-
-        model = BertModel.from_pretrained(model_id, subfolder=subfolder)
-
-        self.assertIsNotNone(model)
-
-    def test_model_from_pretrained_hub_subfolder_sharded(self):
-        subfolder = "bert"
-        model_id = "hf-internal-testing/tiny-random-bert-sharded-subfolder"
-        with self.assertRaises(OSError):
-            _ = BertModel.from_pretrained(model_id)
-
-        model = BertModel.from_pretrained(model_id, subfolder=subfolder)
-
-        self.assertIsNotNone(model)
-
-    def test_model_from_pretrained_with_different_pretrained_model_name(self):
-        model = T5ForConditionalGeneration.from_pretrained(TINY_T5)
-        self.assertIsNotNone(model)
-
-        logger = logging.get_logger("transformers.configuration_utils")
-        with CaptureLogger(logger) as cl:
-            BertModel.from_pretrained(TINY_T5)
-        self.assertTrue("You are using a model of type t5 to instantiate a model of type bert" in cl.out)
-
-    def test_model_from_config_torch_dtype(self):
-        # test that the model can be instantiated with dtype of user's choice - as long as it's a
-        # float dtype. To make it happen config.torch_dtype needs to be set before instantiating the
-        # model from the config object.
-
-        config = T5Config.from_pretrained(TINY_T5)
-        model = AutoModel.from_config(config)
-        # XXX: isn't supported
-        # model = T5ForConditionalGeneration.from_config(config)
-        self.assertEqual(model.dtype, torch.float32)
-
-        model = AutoModel.from_config(config, torch_dtype=torch.float16)
-        self.assertEqual(model.dtype, torch.float16)
-
-        # torch.set_default_dtype() supports only float dtypes, so will fail with non-float type
-        with self.assertRaises(ValueError):
-            model = AutoModel.from_config(config, torch_dtype=torch.int64)
-
-    def test_model_from_pretrained_torch_dtype(self):
-        # test that the model can be instantiated with dtype of either
-        # 1. explicit from_pretrained's torch_dtype argument
-        # 2. via autodiscovery by looking at model weights (torch_dtype="auto")
-        # so if a model.half() was saved, we want it to be instantiated as such.
-        #
-        # test an explicit model class, but also AutoModel separately as the latter goes through a different code path
-        model_path = self.get_auto_remove_tmp_dir()
-
-        # baseline - we know TINY_T5 is fp32 model
-        model = T5ForConditionalGeneration.from_pretrained(TINY_T5)
-        self.assertEqual(model.dtype, torch.float32)
-
-        def remove_torch_dtype(model_path):
-            file = f"{model_path}/config.json"
-            with open(file, "r", encoding="utf-8") as f:
-                s = json.load(f)
-            s.pop("torch_dtype")
-            with open(file, "w", encoding="utf-8") as f:
-                json.dump(s, f)
-
-        # test the default fp32 save_pretrained => from_pretrained cycle
-        model.save_pretrained(model_path)
-        model = T5ForConditionalGeneration.from_pretrained(model_path)
-        self.assertEqual(model.dtype, torch.float32)
-        # 1. test torch_dtype="auto" via `config.torch_dtype`
-        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto")
-        self.assertEqual(model.dtype, torch.float32)
-        # 2. test torch_dtype="auto" via auto-derivation
-        # now remove the torch_dtype entry from config.json and try "auto" again which should
-        # perform auto-derivation from weights
-        remove_torch_dtype(model_path)
-        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto")
-        self.assertEqual(model.dtype, torch.float32)
-
-        # test forced loading in fp16 (even though the weights are in fp32)
-        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
-        self.assertEqual(model.dtype, torch.float16)
-
-        # test fp16 save_pretrained, loaded with auto-detection
-        model = model.half()
-        model.save_pretrained(model_path)
-        # 1. test torch_dtype="auto" via `config.torch_dtype`
-        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto")
-        self.assertEqual(model.config.torch_dtype, torch.float16)
-        self.assertEqual(model.dtype, torch.float16)
-        # tests `config.torch_dtype` saving
-        with open(f"{model_path}/config.json") as f:
-            config_dict = json.load(f)
-        self.assertEqual(config_dict["torch_dtype"], "float16")
-        # 2. test torch_dtype="auto" via auto-derivation
-        # now same with using config info
-        remove_torch_dtype(model_path)
-        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto")
-        self.assertEqual(model.dtype, torch.float16)
-
-        # 3. now retest that AutoModel behaves the same wrt torch_dtype="auto" as T5ForConditionalGeneration
-        model = AutoModel.from_pretrained(model_path, torch_dtype="auto")
-        self.assertEqual(model.dtype, torch.float16)
-
-        # test fp16 save_pretrained, loaded with the explicit fp16
-        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
-        self.assertEqual(model.dtype, torch.float16)
-
-        # test AutoModel separately as it goes through a different path
-        # test auto-detection - as currently TINY_T5 doesn't have torch_dtype entry
-        model = AutoModel.from_pretrained(TINY_T5, torch_dtype="auto")
-        # test that the config object didn't get polluted with torch_dtype="auto"
-        # there was a bug that after this call we ended up with config.torch_dtype=="auto"
-        self.assertNotEqual(model.config.torch_dtype, "auto")
-        # now test the outcome
-        self.assertEqual(model.dtype, torch.float32)
-        model = AutoModel.from_pretrained(TINY_T5, torch_dtype=torch.float16)
-        self.assertEqual(model.dtype, torch.float16)
-
-        # test model whose first param is not of a floating type, but int
-        model = AutoModel.from_pretrained(TINY_BERT_FOR_TOKEN_CLASSIFICATION, torch_dtype="auto")
-        self.assertEqual(model.dtype, torch.float32)
-
-    def test_no_super_init_config_and_model(self):
-        config = NoSuperInitConfig(attribute=32)
-        model = NoSuperInitModel(config)
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir)
-
-            new_model = NoSuperInitModel.from_pretrained(tmp_dir)
-
-        for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.equal(p1, p2))
-
-    def test_shard_checkpoint(self):
-        # This is the model we will use, total size 340,000 bytes.
-        model = torch.nn.Sequential(
-            torch.nn.Linear(100, 200, bias=False),  # size 80,000
-            torch.nn.Linear(200, 200, bias=False),  # size 160,000
-            torch.nn.Linear(200, 100, bias=False),  # size 80,000
-            torch.nn.Linear(100, 50, bias=False),  # size 20,000
-        )
-        state_dict = model.state_dict()
-
-        with self.subTest("No shard when max size is bigger than model size"):
-            shards, index = shard_checkpoint(state_dict)
-            self.assertIsNone(index)
-            self.assertDictEqual(shards, {WEIGHTS_NAME: state_dict})
-
-        with self.subTest("Test sharding, no weights bigger than max size"):
-            shards, index = shard_checkpoint(state_dict, max_shard_size="300kB")
-            # Split is first two layers then last two.
-            self.assertDictEqual(
-                index,
-                {
-                    "metadata": {"total_size": 340000},
-                    "weight_map": {
-                        "0.weight": "pytorch_model-00001-of-00002.bin",
-                        "1.weight": "pytorch_model-00001-of-00002.bin",
-                        "2.weight": "pytorch_model-00002-of-00002.bin",
-                        "3.weight": "pytorch_model-00002-of-00002.bin",
-                    },
-                },
-            )
-
-            shard1 = {"0.weight": state_dict["0.weight"], "1.weight": state_dict["1.weight"]}
-            shard2 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]}
-            self.assertDictEqual(
-                shards, {"pytorch_model-00001-of-00002.bin": shard1, "pytorch_model-00002-of-00002.bin": shard2}
-            )
-
-        with self.subTest("Test sharding with weights bigger than max size"):
-            shards, index = shard_checkpoint(state_dict, max_shard_size="100kB")
-            # Split is first layer, second layer then last 2.
-            self.assertDictEqual(
-                index,
-                {
-                    "metadata": {"total_size": 340000},
-                    "weight_map": {
-                        "0.weight": "pytorch_model-00001-of-00003.bin",
-                        "1.weight": "pytorch_model-00002-of-00003.bin",
-                        "2.weight": "pytorch_model-00003-of-00003.bin",
-                        "3.weight": "pytorch_model-00003-of-00003.bin",
-                    },
-                },
-            )
-
-            shard1 = {"0.weight": state_dict["0.weight"]}
-            shard2 = {"1.weight": state_dict["1.weight"]}
-            shard3 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]}
-            self.assertDictEqual(
-                shards,
-                {
-                    "pytorch_model-00001-of-00003.bin": shard1,
-                    "pytorch_model-00002-of-00003.bin": shard2,
-                    "pytorch_model-00003-of-00003.bin": shard3,
-                },
-            )
-
-    def test_checkpoint_sharding_local(self):
-        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            # We use the same folder for various sizes to make sure a new save erases the old checkpoint.
-            for max_size in ["50kB", "50kiB", "100kB", "100kiB", "200kB", "200kiB"]:
-                model.save_pretrained(tmp_dir, max_shard_size=max_size)
-
-                # Get each shard file and its size
-                shard_to_size = {}
-                for shard in os.listdir(tmp_dir):
-                    if shard.endswith(".bin"):
-                        shard_file = os.path.join(tmp_dir, shard)
-                        shard_to_size[shard_file] = os.path.getsize(shard_file)
-
-                index_file = os.path.join(tmp_dir, WEIGHTS_INDEX_NAME)
-                # Check there is an index but no regular weight file
-                self.assertTrue(os.path.isfile(index_file))
-                self.assertFalse(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_NAME)))
-
-                # Check a file is bigger than max_size only when it has a single weight
-                for shard_file, size in shard_to_size.items():
-                    if max_size.endswith("kiB"):
-                        max_size_int = int(max_size[:-3]) * 2**10
-                    else:
-                        max_size_int = int(max_size[:-2]) * 10**3
-                    # Note: pickle adds some junk so the weight of the file can end up being slightly bigger than
-                    # the size asked for (since we count parameters)
-                    if size >= max_size_int + 50000:
-                        state_dict = torch.load(shard_file)
-                        self.assertEqual(len(state_dict), 1)
-
-                # Check the index and the shard files found match
-                with open(index_file, "r", encoding="utf-8") as f:
-                    index = json.loads(f.read())
-
-                all_shards = set(index["weight_map"].values())
-                shards_found = {f for f in os.listdir(tmp_dir) if f.endswith(".bin")}
-                self.assertSetEqual(all_shards, shards_found)
-
-                # Finally, check the model can be reloaded
-                new_model = BertModel.from_pretrained(tmp_dir)
-                for p1, p2 in zip(model.parameters(), new_model.parameters()):
-                    self.assertTrue(torch.allclose(p1, p2))
-
-    def test_checkpoint_sharding_from_hub(self):
-        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded")
-        # the model above is the same as the model below, just a sharded version.
-        ref_model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-        for p1, p2 in zip(model.parameters(), ref_model.parameters()):
-            self.assertTrue(torch.allclose(p1, p2))
-
-    def test_checkpoint_variant_local(self):
-        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, variant="v2")
-
-            weights_name = ".".join(WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["bin"])
-
-            weights_file = os.path.join(tmp_dir, weights_name)
-            self.assertTrue(os.path.isfile(weights_file))
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_NAME)))
-
-            with self.assertRaises(EnvironmentError):
-                _ = BertModel.from_pretrained(tmp_dir)
-
-            new_model = BertModel.from_pretrained(tmp_dir, variant="v2")
-
-        for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.allclose(p1, p2))
-
-    def test_checkpoint_variant_local_sharded(self):
-        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, variant="v2", max_shard_size="50kB")
-
-            weights_index_name = ".".join(WEIGHTS_INDEX_NAME.split(".")[:-1] + ["v2"] + ["json"])
-            weights_index_file = os.path.join(tmp_dir, weights_index_name)
-            self.assertTrue(os.path.isfile(weights_index_file))
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_INDEX_NAME)))
-
-            for i in range(1, 6):
-                weights_name = ".".join(WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00006"] + ["bin"])
-                weights_name_file = os.path.join(tmp_dir, weights_name)
-                self.assertTrue(os.path.isfile(weights_name_file))
-
-            with self.assertRaises(EnvironmentError):
-                _ = BertModel.from_pretrained(tmp_dir)
-
-            new_model = BertModel.from_pretrained(tmp_dir, variant="v2")
-
-        for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.allclose(p1, p2))
-
-    @require_safetensors
-    def test_checkpoint_variant_local_safe(self):
-        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, variant="v2", safe_serialization=True)
-
-            weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["safetensors"])
-
-            weights_file = os.path.join(tmp_dir, weights_name)
-            self.assertTrue(os.path.isfile(weights_file))
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME)))
-
-            with self.assertRaises(EnvironmentError):
-                _ = BertModel.from_pretrained(tmp_dir)
-
-            new_model = BertModel.from_pretrained(tmp_dir, variant="v2")
-
-        for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.allclose(p1, p2))
-
-    @require_safetensors
-    def test_checkpoint_variant_local_sharded_safe(self):
-        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, variant="v2", max_shard_size="50kB", safe_serialization=True)
-
-            weights_index_name = ".".join(SAFE_WEIGHTS_INDEX_NAME.split(".")[:-1] + ["v2"] + ["json"])
-            weights_index_file = os.path.join(tmp_dir, weights_index_name)
-            self.assertTrue(os.path.isfile(weights_index_file))
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
-
-            for i in range(1, 6):
-                weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00006"] + ["safetensors"])
-                weights_name_file = os.path.join(tmp_dir, weights_name)
-                self.assertTrue(os.path.isfile(weights_name_file))
-
-            with self.assertRaises(EnvironmentError):
-                _ = BertModel.from_pretrained(tmp_dir)
-
-            new_model = BertModel.from_pretrained(tmp_dir, variant="v2")
-
-        for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.allclose(p1, p2))
-
-    def test_checkpoint_variant_hub(self):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            with self.assertRaises(EnvironmentError):
-                _ = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert-variant", cache_dir=tmp_dir)
-            model = BertModel.from_pretrained(
-                "hf-internal-testing/tiny-random-bert-variant", cache_dir=tmp_dir, variant="v2"
-            )
-        self.assertIsNotNone(model)
-
-    def test_checkpoint_variant_hub_sharded(self):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            with self.assertRaises(EnvironmentError):
-                _ = BertModel.from_pretrained(
-                    "hf-internal-testing/tiny-random-bert-variant-sharded", cache_dir=tmp_dir
-                )
-            model = BertModel.from_pretrained(
-                "hf-internal-testing/tiny-random-bert-variant-sharded", cache_dir=tmp_dir, variant="v2"
-            )
-        self.assertIsNotNone(model)
-
-    @require_safetensors
-    def test_checkpoint_variant_hub_safe(self):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            with self.assertRaises(EnvironmentError):
-                _ = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert-variant-safe", cache_dir=tmp_dir)
-            model = BertModel.from_pretrained(
-                "hf-internal-testing/tiny-random-bert-variant-safe", cache_dir=tmp_dir, variant="v2"
-            )
-        self.assertIsNotNone(model)
-
-    @require_safetensors
-    def test_checkpoint_variant_hub_sharded_safe(self):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            with self.assertRaises(EnvironmentError):
-                _ = BertModel.from_pretrained(
-                    "hf-internal-testing/tiny-random-bert-variant-sharded-safe", cache_dir=tmp_dir
-                )
-            model = BertModel.from_pretrained(
-                "hf-internal-testing/tiny-random-bert-variant-sharded-safe", cache_dir=tmp_dir, variant="v2"
-            )
-        self.assertIsNotNone(model)
-
-    def test_checkpoint_variant_save_load(self):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model = BertModel.from_pretrained(
-                "hf-internal-testing/tiny-random-bert-variant", cache_dir=tmp_dir, variant="v2"
-            )
-            weights_name = ".".join(WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["bin"])
-
-            model.save_pretrained(tmp_dir, variant="v2")
-            # saving will create a variant checkpoint
-            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, weights_name)))
-
-            model.save_pretrained(tmp_dir)
-            # saving shouldn't delete variant checkpoints
-            weights_name = ".".join(WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["bin"])
-            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, weights_name)))
-
-            # there should be a normal checkpoint
-            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_NAME)))
-
-        self.assertIsNotNone(model)
-
-    @require_accelerate
-    @mark.accelerate_tests
-    def test_from_pretrained_low_cpu_mem_usage_functional(self):
-        # test that we can use `from_pretrained(..., low_cpu_mem_usage=True)` with normal and
-        # sharded models
-
-        mnames = [
-            "hf-internal-testing/tiny-random-bert-sharded",
-            "hf-internal-testing/tiny-random-bert",
-        ]
-        for mname in mnames:
-            _ = BertModel.from_pretrained(mname, low_cpu_mem_usage=True)
-
-    @require_usr_bin_time
-    @require_accelerate
-    @mark.accelerate_tests
-    def test_from_pretrained_low_cpu_mem_usage_measured(self):
-        # test that `from_pretrained(..., low_cpu_mem_usage=True)` uses less cpu memory than default
-
-        mname = "bert-base-cased"
-
-        preamble = "from transformers import AutoModel"
-        one_liner_str = f'{preamble}; AutoModel.from_pretrained("{mname}", low_cpu_mem_usage=False)'
-        max_rss_normal = self.python_one_liner_max_rss(one_liner_str)
-        # print(f"{max_rss_normal=}")
-
-        one_liner_str = f'{preamble};  AutoModel.from_pretrained("{mname}", low_cpu_mem_usage=True)'
-        max_rss_low_mem = self.python_one_liner_max_rss(one_liner_str)
-        # print(f"{max_rss_low_mem=}")
-
-        diff_bytes = max_rss_normal - max_rss_low_mem
-        diff_percent = diff_bytes / max_rss_low_mem
-        # print(f"{diff_bytes=}, {diff_percent=}")
-        # ideally we would compare that the diff is close to ~1x checkpoint size in bytes, but
-        # measuring cpu memory on linux is very tricky and inconsistent, so instead let's check that
-        # it's at least 15% less cpu memory consumed
-
-        self.assertGreater(
-            diff_percent,
-            0.15,
-            "should use less CPU memory for low_cpu_mem_usage=True, "
-            f"but got max_rss_normal={max_rss_normal} and max_rss_low_mem={max_rss_low_mem}",
-        )
-
-        # if you want to compare things manually, let's first look at the size of the model in bytes
-        # model = BertModel.from_pretrained(mname, low_cpu_mem_usage=False)
-        # total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
-        # total_bytes = total_numel * 4  # 420MB
-        # Now the diff_bytes should be very close to total_bytes, but the reports are inconsistent.
-        # The easiest way to test this is to switch the model and torch.load to do all the work on
-        # gpu - that way one can measure exactly the total and peak memory used. Perhaps once we add
-        # functionality to load models directly on gpu, this test can be rewritten to use torch's
-        # cuda memory tracking and then we should be able to do a much more precise test.
-
-    @require_accelerate
-    @mark.accelerate_tests
-    @require_torch_multi_gpu
-    @slow
-    def test_model_parallelism_gpt2(self):
-        device_map = {"transformer.wte": 0, "transformer.wpe": 0, "lm_head": 0, "transformer.ln_f": 1}
-        for i in range(12):
-            device_map[f"transformer.h.{i}"] = 0 if i <= 5 else 1
-
-        model = AutoModelForCausalLM.from_pretrained("gpt2", device_map=device_map)
-
-        tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        inputs = tokenizer("Hello, my name is", return_tensors="pt")
-        output = model.generate(inputs["input_ids"].to(0))
-
-        text_output = tokenizer.decode(output[0].tolist())
-        self.assertEqual(text_output, "Hello, my name is John. I'm a writer, and I'm a writer. I'm")
-
-    @require_accelerate
-    @mark.accelerate_tests
-    @require_torch_gpu
-    def test_from_pretrained_disk_offload_task_model(self):
-        model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-gpt2")
-        device_map = {
-            "transformer.wte": 0,
-            "transformer.wpe": 0,
-            "transformer.h.0": "cpu",
-            "transformer.h.1": "cpu",
-            "transformer.h.2": "cpu",
-            "transformer.h.3": "disk",
-            "transformer.h.4": "disk",
-            "transformer.ln_f": 0,
-            "lm_head": 0,
-        }
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            inputs = torch.tensor([[1, 2, 3]]).to(0)
-
-            model.save_pretrained(tmp_dir)
-            new_model = AutoModelForCausalLM.from_pretrained(tmp_dir).to(0)
-            outputs1 = new_model.to(0)(inputs)
-
-            offload_folder = os.path.join(tmp_dir, "offload")
-            new_model_with_offload = AutoModelForCausalLM.from_pretrained(
-                tmp_dir, device_map=device_map, offload_folder=offload_folder
-            )
-            outputs2 = new_model_with_offload(inputs)
-
-            self.assertTrue(torch.allclose(outputs1.logits.cpu(), outputs2.logits.cpu()))
-
-            # With state dict temp offload
-            offload_folder = os.path.join(tmp_dir, "offload")
-            new_model_with_offload = AutoModelForCausalLM.from_pretrained(
-                tmp_dir,
-                device_map=device_map,
-                offload_folder=offload_folder,
-                offload_state_dict=True,
-            )
-            outputs2 = new_model_with_offload(inputs)
-
-            self.assertTrue(torch.allclose(outputs1.logits.cpu(), outputs2.logits.cpu()))
-
-    def test_cached_files_are_used_when_internet_is_down(self):
-        # A mock response for an HTTP head request to emulate server down
-        response_mock = mock.Mock()
-        response_mock.status_code = 500
-        response_mock.headers = {}
-        response_mock.raise_for_status.side_effect = HTTPError
-        response_mock.json.return_value = {}
-
-        # Download this model to make sure it's in the cache.
-        _ = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        # Under the mock environment we get a 500 error when trying to reach the model.
-        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
-            _ = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-            # This check we did call the fake head request
-            mock_head.assert_called()
-
-    def test_load_from_one_file(self):
-        try:
-            tmp_file = tempfile.mktemp()
-            with open(tmp_file, "wb") as f:
-                http_get(
-                    "https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/pytorch_model.bin", f
-                )
-
-            config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
-            _ = BertModel.from_pretrained(tmp_file, config=config)
-        finally:
-            os.remove(tmp_file)
-
-    def test_legacy_load_from_url(self):
-        # This test is for deprecated behavior and can be removed in v5
-        config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
-        _ = BertModel.from_pretrained(
-            "https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/pytorch_model.bin", config=config
-        )
-
-    @require_safetensors
-    def test_use_safetensors(self):
-        # test nice error message if no safetensor files available
-        with self.assertRaises(OSError) as env_error:
-            AutoModel.from_pretrained("hf-internal-testing/tiny-random-RobertaModel", use_safetensors=True)
-
-        self.assertTrue(
-            "model.safetensors or model.safetensors.index.json and thus cannot be loaded with `safetensors`"
-            in str(env_error.exception)
-        )
-
-        # test that error if only safetensors is available
-        with self.assertRaises(OSError) as env_error:
-            BertModel.from_pretrained("hf-internal-testing/tiny-random-bert-safetensors", use_safetensors=False)
-
-        self.assertTrue("does not appear to have a file named pytorch_model.bin" in str(env_error.exception))
-
-        # test that only safetensors if both available and use_safetensors=False
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            CLIPTextModel.from_pretrained(
-                "hf-internal-testing/diffusers-stable-diffusion-tiny-all",
-                subfolder="text_encoder",
-                use_safetensors=False,
-                cache_dir=tmp_dir,
-            )
-
-            all_downloaded_files = glob.glob(os.path.join(tmp_dir, "*", "snapshots", "*", "*", "*"))
-            self.assertTrue(any(f.endswith("bin") for f in all_downloaded_files))
-            self.assertFalse(any(f.endswith("safetensors") for f in all_downloaded_files))
-
-        # test that no safetensors if both available and use_safetensors=True
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            CLIPTextModel.from_pretrained(
-                "hf-internal-testing/diffusers-stable-diffusion-tiny-all",
-                subfolder="text_encoder",
-                use_safetensors=True,
-                cache_dir=tmp_dir,
-            )
-
-            all_downloaded_files = glob.glob(os.path.join(tmp_dir, "*", "snapshots", "*", "*", "*"))
-            self.assertTrue(any(f.endswith("safetensors") for f in all_downloaded_files))
-            self.assertFalse(any(f.endswith("bin") for f in all_downloaded_files))
-
-    @require_safetensors
-    def test_safetensors_save_and_load(self):
-        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, safe_serialization=True)
-            # No pytorch_model.bin file, only a model.safetensors
-            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME)))
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_NAME)))
-
-            new_model = BertModel.from_pretrained(tmp_dir)
-
-            # Check models are equal
-            for p1, p2 in zip(model.parameters(), new_model.parameters()):
-                self.assertTrue(torch.allclose(p1, p2))
-
-    @require_safetensors
-    def test_safetensors_load_from_hub(self):
-        safetensors_model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert-safetensors")
-        pytorch_model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        # Check models are equal
-        for p1, p2 in zip(safetensors_model.parameters(), pytorch_model.parameters()):
-            self.assertTrue(torch.allclose(p1, p2))
-
-    @require_safetensors
-    def test_safetensors_save_and_load_sharded(self):
-        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, safe_serialization=True, max_shard_size="100kB")
-            # No pytorch_model.bin index file, only a model.safetensors index
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_INDEX_NAME)))
-            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
-            # No regular weights file
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_NAME)))
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME)))
-
-            new_model = BertModel.from_pretrained(tmp_dir)
-
-            # Check models are equal
-            for p1, p2 in zip(model.parameters(), new_model.parameters()):
-                self.assertTrue(torch.allclose(p1, p2))
-
-    @require_safetensors
-    def test_safetensors_load_from_hub_sharded(self):
-        safetensors_model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded-safetensors")
-        pytorch_model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded")
-
-        # Check models are equal
-        for p1, p2 in zip(safetensors_model.parameters(), pytorch_model.parameters()):
-            self.assertTrue(torch.allclose(p1, p2))
-
-    def test_base_model_to_head_model_load(self):
-        base_model = BaseModel(PretrainedConfig())
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            base_model.save_pretrained(tmp_dir)
-
-            # Can load a base model in a model with head
-            model = ModelWithHead.from_pretrained(tmp_dir)
-            for p1, p2 in zip(model.base.parameters(), base_model.parameters()):
-                self.assertTrue(torch.allclose(p1, p2))
-
-            # It doesn't work if the state dict has a mix of keys of the head and base without prefix though.
-            base_state_dict = base_model.state_dict()
-            head_state_dict = model.state_dict()
-            base_state_dict["linear2.weight"] = head_state_dict["linear2.weight"]
-            base_state_dict["linear2.bias"] = head_state_dict["linear2.bias"]
-            torch.save(base_state_dict, os.path.join(tmp_dir, WEIGHTS_NAME))
-
-            with self.assertRaisesRegex(
-                ValueError, "The state dictionary of the model you are trying to load is corrupted."
-            ):
-                _ = ModelWithHead.from_pretrained(tmp_dir)
-
-    @require_torch_gpu
-    @slow
-    def test_pretrained_low_mem_new_config(self):
-        # Checking for 1 model(the same one which was described in the issue) .
-        model_ids = ["gpt2"]
-
-        for model_id in model_ids:
-            model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_id)
-            model_config.n_layer = 48
-            model_config.n_head = 25
-            model_config.n_embd = 1600
-            model = AutoModelForCausalLM.from_pretrained(
-                pretrained_model_name_or_path=model_id,
-                config=model_config,
-                ignore_mismatched_sizes=True,
-                torch_dtype=torch.float16,
-                low_cpu_mem_usage=True,
-            )
-            model_ref = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id)
-
-            self.assertEqual(model.__class__.__name__, model_ref.__class__.__name__)
-
-
-@require_torch
-@is_staging_test
-class ModelPushToHubTester(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls._token = TOKEN
-        HfFolder.save_token(TOKEN)
-
-    @classmethod
-    def tearDownClass(cls):
-        try:
-            delete_repo(token=cls._token, repo_id="test-model")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, repo_id="valid_org/test-model-org")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, repo_id="test-dynamic-model")
-        except HTTPError:
-            pass
-
-    def test_push_to_hub(self):
-        config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-        )
-        model = BertModel(config)
-        model.push_to_hub("test-model", use_auth_token=self._token)
-
-        new_model = BertModel.from_pretrained(f"{USER}/test-model")
-        for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.equal(p1, p2))
-
-        # Reset repo
-        delete_repo(token=self._token, repo_id="test-model")
-
-        # Push to hub via save_pretrained
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, repo_id="test-model", push_to_hub=True, use_auth_token=self._token)
-
-        new_model = BertModel.from_pretrained(f"{USER}/test-model")
-        for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.equal(p1, p2))
-
-    def test_push_to_hub_in_organization(self):
-        config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-        )
-        model = BertModel(config)
-        model.push_to_hub("valid_org/test-model-org", use_auth_token=self._token)
-
-        new_model = BertModel.from_pretrained("valid_org/test-model-org")
-        for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.equal(p1, p2))
-
-        # Reset repo
-        delete_repo(token=self._token, repo_id="valid_org/test-model-org")
-
-        # Push to hub via save_pretrained
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(
-                tmp_dir, push_to_hub=True, use_auth_token=self._token, repo_id="valid_org/test-model-org"
-            )
-
-        new_model = BertModel.from_pretrained("valid_org/test-model-org")
-        for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.equal(p1, p2))
-
-    def test_push_to_hub_dynamic_model(self):
-        CustomConfig.register_for_auto_class()
-        CustomModel.register_for_auto_class()
-
-        config = CustomConfig(hidden_size=32)
-        model = CustomModel(config)
-
-        model.push_to_hub("test-dynamic-model", use_auth_token=self._token)
-        # checks
-        self.assertDictEqual(
-            config.auto_map,
-            {"AutoConfig": "custom_configuration.CustomConfig", "AutoModel": "custom_modeling.CustomModel"},
-        )
-
-        new_model = AutoModel.from_pretrained(f"{USER}/test-dynamic-model", trust_remote_code=True)
-        # Can't make an isinstance check because the new_model is from the CustomModel class of a dynamic module
-        self.assertEqual(new_model.__class__.__name__, "CustomModel")
-        for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.equal(p1, p2))
-
-        config = AutoConfig.from_pretrained(f"{USER}/test-dynamic-model", trust_remote_code=True)
-        new_model = AutoModel.from_config(config, trust_remote_code=True)
-        self.assertEqual(new_model.__class__.__name__, "CustomModel")
diff --git a/tests/test_modeling_flax_common.py b/tests/test_modeling_flax_common.py
index 058d790611..58ada0226a 100644
--- a/tests/test_modeling_flax_common.py
+++ b/tests/test_modeling_flax_common.py
@@ -17,25 +17,14 @@ import inspect
 import json
 import random
 import tempfile
-import unittest
 from typing import List, Tuple
 
 import numpy as np
-from huggingface_hub import HfFolder, delete_repo
-from requests.exceptions import HTTPError
 
 import transformers
-from transformers import BertConfig, is_flax_available, is_torch_available
+from transformers import is_flax_available, is_torch_available
 from transformers.models.auto import get_values
-from transformers.testing_utils import (
-    TOKEN,
-    USER,
-    CaptureLogger,
-    is_pt_flax_cross_test,
-    is_staging_test,
-    require_flax,
-    torch_device,
-)
+from transformers.testing_utils import CaptureLogger, is_pt_flax_cross_test, require_flax, torch_device
 from transformers.utils import CONFIG_NAME, GENERATION_CONFIG_NAME, logging
 from transformers.utils.generic import ModelOutput
 
@@ -69,14 +58,6 @@ if is_torch_available():
     import torch
 
 
-def _config_zero_init(config):
-    configs_no_init = copy.deepcopy(config)
-    for key in configs_no_init.__dict__.keys():
-        if "_range" in key or "_std" in key or "initializer_factor" in key:
-            setattr(configs_no_init, key, 1e-10)
-    return configs_no_init
-
-
 def ids_tensor(shape, vocab_size, rng=None):
     """Creates a random int32 tensor of the shape within the vocab size."""
     if rng is None:
@@ -1164,155 +1145,3 @@ class FlaxModelTesterMixin:
             # ensure that the outputs remain precisely equal
             for output, remat_output in zip(outputs, remat_outputs):
                 self.assertTrue((output == remat_output).all())
-
-
-@require_flax
-@is_staging_test
-class FlaxModelPushToHubTester(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls._token = TOKEN
-        HfFolder.save_token(TOKEN)
-
-    @classmethod
-    def tearDownClass(cls):
-        try:
-            delete_repo(token=cls._token, repo_id="test-model-flax")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, repo_id="valid_org/test-model-flax-org")
-        except HTTPError:
-            pass
-
-    def test_push_to_hub(self):
-        config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-        )
-        model = FlaxBertModel(config)
-        model.push_to_hub("test-model-flax", use_auth_token=self._token)
-
-        new_model = FlaxBertModel.from_pretrained(f"{USER}/test-model-flax")
-
-        base_params = flatten_dict(unfreeze(model.params))
-        new_params = flatten_dict(unfreeze(new_model.params))
-
-        for key in base_params.keys():
-            max_diff = (base_params[key] - new_params[key]).sum().item()
-            self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-        # Reset repo
-        delete_repo(token=self._token, repo_id="test-model-flax")
-
-        # Push to hub via save_pretrained
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, repo_id="test-model-flax", push_to_hub=True, use_auth_token=self._token)
-
-        new_model = FlaxBertModel.from_pretrained(f"{USER}/test-model-flax")
-
-        base_params = flatten_dict(unfreeze(model.params))
-        new_params = flatten_dict(unfreeze(new_model.params))
-
-        for key in base_params.keys():
-            max_diff = (base_params[key] - new_params[key]).sum().item()
-            self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-    def test_push_to_hub_in_organization(self):
-        config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-        )
-        model = FlaxBertModel(config)
-        model.push_to_hub("valid_org/test-model-flax-org", use_auth_token=self._token)
-
-        new_model = FlaxBertModel.from_pretrained("valid_org/test-model-flax-org")
-
-        base_params = flatten_dict(unfreeze(model.params))
-        new_params = flatten_dict(unfreeze(new_model.params))
-
-        for key in base_params.keys():
-            max_diff = (base_params[key] - new_params[key]).sum().item()
-            self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-        # Reset repo
-        delete_repo(token=self._token, repo_id="valid_org/test-model-flax-org")
-
-        # Push to hub via save_pretrained
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(
-                tmp_dir, repo_id="valid_org/test-model-flax-org", push_to_hub=True, use_auth_token=self._token
-            )
-
-        new_model = FlaxBertModel.from_pretrained("valid_org/test-model-flax-org")
-
-        base_params = flatten_dict(unfreeze(model.params))
-        new_params = flatten_dict(unfreeze(new_model.params))
-
-        for key in base_params.keys():
-            max_diff = (base_params[key] - new_params[key]).sum().item()
-            self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
-
-
-def check_models_equal(model1, model2):
-    models_are_equal = True
-    flat_params_1 = flatten_dict(model1.params)
-    flat_params_2 = flatten_dict(model2.params)
-    for key in flat_params_1.keys():
-        if np.sum(np.abs(flat_params_1[key] - flat_params_2[key])) > 1e-4:
-            models_are_equal = False
-
-    return models_are_equal
-
-
-@require_flax
-class FlaxModelUtilsTest(unittest.TestCase):
-    def test_model_from_pretrained_subfolder(self):
-        config = BertConfig.from_pretrained("hf-internal-testing/tiny-bert-flax-only")
-        model = FlaxBertModel(config)
-
-        subfolder = "bert"
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(os.path.join(tmp_dir, subfolder))
-
-            with self.assertRaises(OSError):
-                _ = FlaxBertModel.from_pretrained(tmp_dir)
-
-            model_loaded = FlaxBertModel.from_pretrained(tmp_dir, subfolder=subfolder)
-
-        self.assertTrue(check_models_equal(model, model_loaded))
-
-    def test_model_from_pretrained_subfolder_sharded(self):
-        config = BertConfig.from_pretrained("hf-internal-testing/tiny-bert-flax-only")
-        model = FlaxBertModel(config)
-
-        subfolder = "bert"
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(os.path.join(tmp_dir, subfolder), max_shard_size="10KB")
-
-            with self.assertRaises(OSError):
-                _ = FlaxBertModel.from_pretrained(tmp_dir)
-
-            model_loaded = FlaxBertModel.from_pretrained(tmp_dir, subfolder=subfolder)
-
-        self.assertTrue(check_models_equal(model, model_loaded))
-
-    def test_model_from_pretrained_hub_subfolder(self):
-        subfolder = "bert"
-        model_id = "hf-internal-testing/tiny-random-bert-subfolder"
-
-        with self.assertRaises(OSError):
-            _ = FlaxBertModel.from_pretrained(model_id)
-
-        model = FlaxBertModel.from_pretrained(model_id, subfolder=subfolder)
-
-        self.assertIsNotNone(model)
-
-    def test_model_from_pretrained_hub_subfolder_sharded(self):
-        subfolder = "bert"
-        model_id = "hf-internal-testing/tiny-random-bert-sharded-subfolder"
-        with self.assertRaises(OSError):
-            _ = FlaxBertModel.from_pretrained(model_id)
-
-        model = FlaxBertModel.from_pretrained(model_id, subfolder=subfolder)
-
-        self.assertIsNotNone(model)
diff --git a/tests/test_modeling_flax_utils.py b/tests/test_modeling_flax_utils.py
new file mode 100644
index 0000000000..d8fb71a610
--- /dev/null
+++ b/tests/test_modeling_flax_utils.py
@@ -0,0 +1,186 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tempfile
+import unittest
+
+import numpy as np
+from huggingface_hub import HfFolder, delete_repo
+from requests.exceptions import HTTPError
+
+from transformers import BertConfig, is_flax_available
+from transformers.testing_utils import TOKEN, USER, is_staging_test, require_flax
+
+
+if is_flax_available():
+    import os
+
+    from flax.core.frozen_dict import unfreeze
+    from flax.traverse_util import flatten_dict
+
+    from transformers import FlaxBertModel
+
+    os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "0.12"  # assumed parallelism: 8
+
+
+@require_flax
+@is_staging_test
+class FlaxModelPushToHubTester(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls._token = TOKEN
+        HfFolder.save_token(TOKEN)
+
+    @classmethod
+    def tearDownClass(cls):
+        try:
+            delete_repo(token=cls._token, repo_id="test-model-flax")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(token=cls._token, repo_id="valid_org/test-model-flax-org")
+        except HTTPError:
+            pass
+
+    def test_push_to_hub(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        model = FlaxBertModel(config)
+        model.push_to_hub("test-model-flax", use_auth_token=self._token)
+
+        new_model = FlaxBertModel.from_pretrained(f"{USER}/test-model-flax")
+
+        base_params = flatten_dict(unfreeze(model.params))
+        new_params = flatten_dict(unfreeze(new_model.params))
+
+        for key in base_params.keys():
+            max_diff = (base_params[key] - new_params[key]).sum().item()
+            self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
+
+        # Reset repo
+        delete_repo(token=self._token, repo_id="test-model-flax")
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir, repo_id="test-model-flax", push_to_hub=True, use_auth_token=self._token)
+
+        new_model = FlaxBertModel.from_pretrained(f"{USER}/test-model-flax")
+
+        base_params = flatten_dict(unfreeze(model.params))
+        new_params = flatten_dict(unfreeze(new_model.params))
+
+        for key in base_params.keys():
+            max_diff = (base_params[key] - new_params[key]).sum().item()
+            self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
+
+    def test_push_to_hub_in_organization(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        model = FlaxBertModel(config)
+        model.push_to_hub("valid_org/test-model-flax-org", use_auth_token=self._token)
+
+        new_model = FlaxBertModel.from_pretrained("valid_org/test-model-flax-org")
+
+        base_params = flatten_dict(unfreeze(model.params))
+        new_params = flatten_dict(unfreeze(new_model.params))
+
+        for key in base_params.keys():
+            max_diff = (base_params[key] - new_params[key]).sum().item()
+            self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
+
+        # Reset repo
+        delete_repo(token=self._token, repo_id="valid_org/test-model-flax-org")
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(
+                tmp_dir, repo_id="valid_org/test-model-flax-org", push_to_hub=True, use_auth_token=self._token
+            )
+
+        new_model = FlaxBertModel.from_pretrained("valid_org/test-model-flax-org")
+
+        base_params = flatten_dict(unfreeze(model.params))
+        new_params = flatten_dict(unfreeze(new_model.params))
+
+        for key in base_params.keys():
+            max_diff = (base_params[key] - new_params[key]).sum().item()
+            self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
+
+
+def check_models_equal(model1, model2):
+    models_are_equal = True
+    flat_params_1 = flatten_dict(model1.params)
+    flat_params_2 = flatten_dict(model2.params)
+    for key in flat_params_1.keys():
+        if np.sum(np.abs(flat_params_1[key] - flat_params_2[key])) > 1e-4:
+            models_are_equal = False
+
+    return models_are_equal
+
+
+@require_flax
+class FlaxModelUtilsTest(unittest.TestCase):
+    def test_model_from_pretrained_subfolder(self):
+        config = BertConfig.from_pretrained("hf-internal-testing/tiny-bert-flax-only")
+        model = FlaxBertModel(config)
+
+        subfolder = "bert"
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(os.path.join(tmp_dir, subfolder))
+
+            with self.assertRaises(OSError):
+                _ = FlaxBertModel.from_pretrained(tmp_dir)
+
+            model_loaded = FlaxBertModel.from_pretrained(tmp_dir, subfolder=subfolder)
+
+        self.assertTrue(check_models_equal(model, model_loaded))
+
+    def test_model_from_pretrained_subfolder_sharded(self):
+        config = BertConfig.from_pretrained("hf-internal-testing/tiny-bert-flax-only")
+        model = FlaxBertModel(config)
+
+        subfolder = "bert"
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(os.path.join(tmp_dir, subfolder), max_shard_size="10KB")
+
+            with self.assertRaises(OSError):
+                _ = FlaxBertModel.from_pretrained(tmp_dir)
+
+            model_loaded = FlaxBertModel.from_pretrained(tmp_dir, subfolder=subfolder)
+
+        self.assertTrue(check_models_equal(model, model_loaded))
+
+    def test_model_from_pretrained_hub_subfolder(self):
+        subfolder = "bert"
+        model_id = "hf-internal-testing/tiny-random-bert-subfolder"
+
+        with self.assertRaises(OSError):
+            _ = FlaxBertModel.from_pretrained(model_id)
+
+        model = FlaxBertModel.from_pretrained(model_id, subfolder=subfolder)
+
+        self.assertIsNotNone(model)
+
+    def test_model_from_pretrained_hub_subfolder_sharded(self):
+        subfolder = "bert"
+        model_id = "hf-internal-testing/tiny-random-bert-sharded-subfolder"
+        with self.assertRaises(OSError):
+            _ = FlaxBertModel.from_pretrained(model_id)
+
+        model = FlaxBertModel.from_pretrained(model_id, subfolder=subfolder)
+
+        self.assertIsNotNone(model)
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 9ebbe86bf0..a4de6e8f47 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -23,42 +23,24 @@ import os
 import random
 import tempfile
 import unittest
-import unittest.mock as mock
 from importlib import import_module
 from math import isnan
 from typing import List, Tuple
 
 from datasets import Dataset
-from huggingface_hub import HfFolder, Repository, delete_repo
-from huggingface_hub.file_download import http_get
-from requests.exceptions import HTTPError
 
 from transformers import is_tf_available, is_torch_available
-from transformers.configuration_utils import PretrainedConfig
 from transformers.models.auto import get_values
 from transformers.testing_utils import (  # noqa: F401
-    TOKEN,
-    USER,
     CaptureLogger,
-    CaptureStdout,
     _tf_gpu_memory_limit,
     is_pt_tf_cross_test,
-    is_staging_test,
-    require_safetensors,
     require_tf,
     require_tf2onnx,
     slow,
-    tooslow,
     torch_device,
 )
-from transformers.utils import (
-    CONFIG_NAME,
-    GENERATION_CONFIG_NAME,
-    SAFE_WEIGHTS_NAME,
-    TF2_WEIGHTS_INDEX_NAME,
-    TF2_WEIGHTS_NAME,
-    logging,
-)
+from transformers.utils import CONFIG_NAME, GENERATION_CONFIG_NAME, logging
 from transformers.utils.generic import ModelOutput
 
 
@@ -66,7 +48,6 @@ logger = logging.get_logger(__name__)
 
 
 if is_tf_available():
-    import h5py
     import numpy as np
     import tensorflow as tf
 
@@ -85,17 +66,8 @@ if is_tf_available():
         TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
         TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
         TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-        BertConfig,
-        PreTrainedModel,
-        PushToHubCallback,
-        RagRetriever,
         TFAutoModel,
         TFAutoModelForSequenceClassification,
-        TFBertForMaskedLM,
-        TFBertForSequenceClassification,
-        TFBertModel,
-        TFPreTrainedModel,
-        TFRagModel,
         TFSharedEmbeddings,
     )
     from transformers.generation import (
@@ -108,8 +80,6 @@ if is_tf_available():
         TFSampleDecoderOnlyOutput,
         TFSampleEncoderDecoderOutput,
     )
-    from transformers.modeling_tf_utils import tf_shard_checkpoint, unpack_inputs
-    from transformers.tf_utils import stable_softmax
 
     tf.config.experimental.enable_tensor_float_32_execution(False)
 
@@ -130,8 +100,6 @@ if is_tf_available():
 if is_torch_available():
     import torch
 
-    from transformers import BertModel
-
 
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
@@ -1995,544 +1963,3 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None):
         values.append(rng.random() * scale)
 
     return tf.reshape(tf.constant(values, dtype=dtype if dtype is not None else tf.float32), shape=shape)
-
-
-@require_tf
-class UtilsFunctionsTest(unittest.TestCase):
-    def test_cached_files_are_used_when_internet_is_down(self):
-        # A mock response for an HTTP head request to emulate server down
-        response_mock = mock.Mock()
-        response_mock.status_code = 500
-        response_mock.headers = {}
-        response_mock.raise_for_status.side_effect = HTTPError
-        response_mock.json.return_value = {}
-
-        # Download this model to make sure it's in the cache.
-        _ = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        # Under the mock environment we get a 500 error when trying to reach the model.
-        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
-            _ = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-            # This check we did call the fake head request
-            mock_head.assert_called()
-
-    def test_load_from_one_file(self):
-        try:
-            tmp_file = tempfile.mktemp()
-            with open(tmp_file, "wb") as f:
-                http_get("https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/tf_model.h5", f)
-
-            config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
-            _ = TFBertModel.from_pretrained(tmp_file, config=config)
-        finally:
-            os.remove(tmp_file)
-
-    def test_legacy_load_from_url(self):
-        # This test is for deprecated behavior and can be removed in v5
-        config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
-        _ = TFBertModel.from_pretrained(
-            "https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/tf_model.h5", config=config
-        )
-
-    # tests whether the unpack_inputs function behaves as expected
-    def test_unpack_inputs(self):
-        class DummyModel:
-            def __init__(self):
-                config_kwargs = {"output_attentions": False, "output_hidden_states": False, "return_dict": False}
-                self.config = PretrainedConfig(**config_kwargs)
-                self.main_input_name = "input_ids"
-
-            @unpack_inputs
-            def call(
-                self,
-                input_ids=None,
-                past_key_values=None,
-                output_attentions=None,
-                output_hidden_states=None,
-                return_dict=None,
-            ):
-                return input_ids, past_key_values, output_attentions, output_hidden_states, return_dict
-
-            @unpack_inputs
-            def foo(self, pixel_values, output_attentions=None, output_hidden_states=None, return_dict=None):
-                return pixel_values, output_attentions, output_hidden_states, return_dict
-
-        dummy_model = DummyModel()
-        input_ids = tf.constant([0, 1, 2, 3], dtype=tf.int32)
-        past_key_values = tf.constant([4, 5, 6, 7], dtype=tf.int32)
-        pixel_values = tf.constant([8, 9, 10, 11], dtype=tf.int32)
-
-        # test case 1: Pass inputs as keyword arguments; Booleans are inherited from the config.
-        output = dummy_model.call(input_ids=input_ids, past_key_values=past_key_values)
-        tf.debugging.assert_equal(output[0], input_ids)
-        tf.debugging.assert_equal(output[1], past_key_values)
-        self.assertFalse(output[2])
-        self.assertFalse(output[3])
-        self.assertFalse(output[4])
-
-        # test case 2: Same as above, but with positional arguments.
-        output = dummy_model.call(input_ids, past_key_values)
-        tf.debugging.assert_equal(output[0], input_ids)
-        tf.debugging.assert_equal(output[1], past_key_values)
-        self.assertFalse(output[2])
-        self.assertFalse(output[3])
-        self.assertFalse(output[4])
-
-        # test case 3: We can also pack everything in the first input.
-        output = dummy_model.call(input_ids={"input_ids": input_ids, "past_key_values": past_key_values})
-        tf.debugging.assert_equal(output[0], input_ids)
-        tf.debugging.assert_equal(output[1], past_key_values)
-        self.assertFalse(output[2])
-        self.assertFalse(output[3])
-        self.assertFalse(output[4])
-
-        # test case 4: Explicit boolean arguments should override the config.
-        output = dummy_model.call(
-            input_ids=input_ids, past_key_values=past_key_values, output_attentions=False, return_dict=True
-        )
-        tf.debugging.assert_equal(output[0], input_ids)
-        tf.debugging.assert_equal(output[1], past_key_values)
-        self.assertFalse(output[2])
-        self.assertFalse(output[3])
-        self.assertTrue(output[4])
-
-        # test case 5: Unexpected arguments should raise an exception.
-        with self.assertRaises(ValueError):
-            output = dummy_model.call(input_ids=input_ids, past_key_values=past_key_values, foo="bar")
-
-        # test case 6: the decorator is independent from `main_input_name` -- it treats the first argument of the
-        # decorated function as its main input.
-        output = dummy_model.foo(pixel_values=pixel_values)
-        tf.debugging.assert_equal(output[0], pixel_values)
-        self.assertFalse(output[1])
-        self.assertFalse(output[2])
-        self.assertFalse(output[3])
-
-    # Tests whether the stable softmax is stable on CPU, with and without XLA
-    def test_xla_stable_softmax(self):
-        large_penalty = -1e9
-        n_tokens = 10
-        batch_size = 8
-
-        def masked_softmax(x, boolean_mask):
-            numerical_mask = (1.0 - tf.cast(boolean_mask, dtype=tf.float32)) * large_penalty
-            masked_x = x + numerical_mask
-            return stable_softmax(masked_x)
-
-        xla_masked_softmax = tf.function(masked_softmax, jit_compile=True)
-        xla_stable_softmax = tf.function(stable_softmax, jit_compile=True)
-        x = tf.random.normal((batch_size, n_tokens))
-
-        # Same outcome regardless of the boolean mask here
-        masked_tokens = random.randint(0, n_tokens)
-        boolean_mask = tf.convert_to_tensor([[1] * (n_tokens - masked_tokens) + [0] * masked_tokens], dtype=tf.int32)
-
-        # We can randomly mask a random numerical input OUTSIDE XLA
-        numerical_mask = (1.0 - tf.cast(boolean_mask, dtype=tf.float32)) * large_penalty
-        masked_x = x + numerical_mask
-        xla_out = xla_stable_softmax(masked_x)
-        out = stable_softmax(masked_x)
-        assert tf.experimental.numpy.allclose(xla_out, out)
-
-        # The stable softmax has the same output as the original softmax
-        unstable_out = tf.nn.softmax(masked_x)
-        assert tf.experimental.numpy.allclose(unstable_out, out)
-
-        # We can randomly mask a random numerical input INSIDE XLA
-        xla_out = xla_masked_softmax(x, boolean_mask)
-        out = masked_softmax(x, boolean_mask)
-        assert tf.experimental.numpy.allclose(xla_out, out)
-
-    def test_checkpoint_sharding_from_hub(self):
-        model = TFBertModel.from_pretrained("ArthurZ/tiny-random-bert-sharded")
-        # the model above is the same as the model below, just a sharded version.
-        ref_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-        for p1, p2 in zip(model.weights, ref_model.weights):
-            assert np.allclose(p1.numpy(), p2.numpy())
-
-    def test_sharded_checkpoint_with_prefix(self):
-        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert", load_weight_prefix="a/b")
-        sharded_model = TFBertModel.from_pretrained("ArthurZ/tiny-random-bert-sharded", load_weight_prefix="a/b")
-        for p1, p2 in zip(model.weights, sharded_model.weights):
-            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-            self.assertTrue(p1.name.startswith("a/b/"))
-            self.assertTrue(p2.name.startswith("a/b/"))
-
-    def test_sharded_checkpoint_transfer(self):
-        # If this doesn't throw an error then the test passes
-        TFBertForSequenceClassification.from_pretrained("ArthurZ/tiny-random-bert-sharded")
-
-    @is_pt_tf_cross_test
-    def test_checkpoint_sharding_local_from_pt(self):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            _ = Repository(local_dir=tmp_dir, clone_from="hf-internal-testing/tiny-random-bert-sharded")
-            model = TFBertModel.from_pretrained(tmp_dir, from_pt=True)
-            # the model above is the same as the model below, just a sharded pytorch version.
-            ref_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-            for p1, p2 in zip(model.weights, ref_model.weights):
-                assert np.allclose(p1.numpy(), p2.numpy())
-
-    @is_pt_tf_cross_test
-    def test_checkpoint_loading_with_prefix_from_pt(self):
-        model = TFBertModel.from_pretrained(
-            "hf-internal-testing/tiny-random-bert", from_pt=True, load_weight_prefix="a/b"
-        )
-        ref_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert", from_pt=True)
-        for p1, p2 in zip(model.weights, ref_model.weights):
-            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-            self.assertTrue(p1.name.startswith("a/b/"))
-
-    @is_pt_tf_cross_test
-    def test_checkpoint_sharding_hub_from_pt(self):
-        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded", from_pt=True)
-        # the model above is the same as the model below, just a sharded pytorch version.
-        ref_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-        for p1, p2 in zip(model.weights, ref_model.weights):
-            assert np.allclose(p1.numpy(), p2.numpy())
-
-    def test_shard_checkpoint(self):
-        # This is the model we will use, total size 340,000 bytes.
-        model = tf.keras.Sequential(
-            [
-                tf.keras.layers.Dense(200, use_bias=False),  # size 80,000
-                tf.keras.layers.Dense(200, use_bias=False),  # size 160,000
-                tf.keras.layers.Dense(100, use_bias=False),  # size 80,000
-                tf.keras.layers.Dense(50, use_bias=False),  # size 20,000
-            ]
-        )
-        inputs = tf.zeros((1, 100), dtype=tf.float32)
-        model(inputs)
-        weights = model.weights
-        weights_dict = {w.name: w for w in weights}
-        with self.subTest("No shard when max size is bigger than model size"):
-            shards, index = tf_shard_checkpoint(weights)
-            self.assertIsNone(index)
-            self.assertDictEqual(shards, {TF2_WEIGHTS_NAME: weights})
-
-        with self.subTest("Test sharding, no weights bigger than max size"):
-            shards, index = tf_shard_checkpoint(weights, max_shard_size="300kB")
-            # Split is first two layers then last two.
-            self.assertDictEqual(
-                index,
-                {
-                    "metadata": {"total_size": 340000},
-                    "weight_map": {
-                        "dense/kernel:0": "tf_model-00001-of-00002.h5",
-                        "dense_1/kernel:0": "tf_model-00001-of-00002.h5",
-                        "dense_2/kernel:0": "tf_model-00002-of-00002.h5",
-                        "dense_3/kernel:0": "tf_model-00002-of-00002.h5",
-                    },
-                },
-            )
-
-            shard1 = [weights_dict["dense/kernel:0"], weights_dict["dense_1/kernel:0"]]
-            shard2 = [weights_dict["dense_2/kernel:0"], weights_dict["dense_3/kernel:0"]]
-            self.assertDictEqual(shards, {"tf_model-00001-of-00002.h5": shard1, "tf_model-00002-of-00002.h5": shard2})
-
-        with self.subTest("Test sharding with weights bigger than max size"):
-            shards, index = tf_shard_checkpoint(weights, max_shard_size="100kB")
-            # Split is first layer, second layer then last 2.
-            self.assertDictEqual(
-                index,
-                {
-                    "metadata": {"total_size": 340000},
-                    "weight_map": {
-                        "dense/kernel:0": "tf_model-00001-of-00003.h5",
-                        "dense_1/kernel:0": "tf_model-00002-of-00003.h5",
-                        "dense_2/kernel:0": "tf_model-00003-of-00003.h5",
-                        "dense_3/kernel:0": "tf_model-00003-of-00003.h5",
-                    },
-                },
-            )
-
-            shard1 = [weights_dict["dense/kernel:0"]]
-            shard2 = [weights_dict["dense_1/kernel:0"]]
-            shard3 = [weights_dict["dense_2/kernel:0"], weights_dict["dense_3/kernel:0"]]
-            self.assertDictEqual(
-                shards,
-                {
-                    "tf_model-00001-of-00003.h5": shard1,
-                    "tf_model-00002-of-00003.h5": shard2,
-                    "tf_model-00003-of-00003.h5": shard3,
-                },
-            )
-
-    @slow
-    def test_special_layer_name_sharding(self):
-        retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
-        model = TFRagModel.from_pretrained("facebook/rag-token-nq", retriever=retriever)
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            for max_size in ["150kB", "150kiB", "200kB", "200kiB"]:
-                model.save_pretrained(tmp_dir, max_shard_size=max_size)
-                ref_model = TFRagModel.from_pretrained(tmp_dir, retriever=retriever)
-                for p1, p2 in zip(model.weights, ref_model.weights):
-                    assert np.allclose(p1.numpy(), p2.numpy())
-
-    def test_checkpoint_sharding_local(self):
-        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            # We use the same folder for various sizes to make sure a new save erases the old checkpoint.
-            for max_size in ["150kB", "150kiB", "200kB", "200kiB"]:
-                model.save_pretrained(tmp_dir, max_shard_size=max_size)
-
-                # Get each shard file and its size
-                shard_to_size = {}
-                for shard in os.listdir(tmp_dir):
-                    if shard.endswith(".h5"):
-                        shard_file = os.path.join(tmp_dir, shard)
-                        shard_to_size[shard_file] = os.path.getsize(shard_file)
-
-                index_file = os.path.join(tmp_dir, TF2_WEIGHTS_INDEX_NAME)
-                # Check there is an index but no regular weight file
-                self.assertTrue(os.path.isfile(index_file))
-                self.assertFalse(os.path.isfile(os.path.join(tmp_dir, TF2_WEIGHTS_NAME)))
-
-                # Check a file is bigger than max_size only when it has a single weight
-                for shard_file, size in shard_to_size.items():
-                    if max_size.endswith("kiB"):
-                        max_size_int = int(max_size[:-3]) * 2**10
-                    else:
-                        max_size_int = int(max_size[:-2]) * 10**3
-                    # Note: pickle adds some junk so the weight of the file can end up being slightly bigger than
-                    # the size asked for (since we count parameters)
-                    if size >= max_size_int + 50000:
-                        with h5py.File(shard_file, "r") as state_file:
-                            self.assertEqual(len(state_file), 1)
-
-                # Check the index and the shard files found match
-                with open(index_file, "r", encoding="utf-8") as f:
-                    index = json.loads(f.read())
-
-                all_shards = set(index["weight_map"].values())
-                shards_found = {f for f in os.listdir(tmp_dir) if f.endswith(".h5")}
-                self.assertSetEqual(all_shards, shards_found)
-
-                # Finally, check the model can be reloaded
-                new_model = TFBertModel.from_pretrained(tmp_dir)
-
-                model.build()
-                new_model.build()
-
-                for p1, p2 in zip(model.weights, new_model.weights):
-                    self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-    @slow
-    def test_save_pretrained_signatures(self):
-        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        # Short custom TF signature function.
-        # `input_signature` is specific to BERT.
-        @tf.function(
-            input_signature=[
-                [
-                    tf.TensorSpec([None, None], tf.int32, name="input_ids"),
-                    tf.TensorSpec([None, None], tf.int32, name="token_type_ids"),
-                    tf.TensorSpec([None, None], tf.int32, name="attention_mask"),
-                ]
-            ]
-        )
-        def serving_fn(input):
-            return model(input)
-
-        # Using default signature (default behavior) overrides 'serving_default'
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, saved_model=True, signatures=None)
-            model_loaded = tf.keras.models.load_model(f"{tmp_dir}/saved_model/1")
-            self.assertTrue("serving_default" in list(model_loaded.signatures.keys()))
-
-        # Providing custom signature function
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, saved_model=True, signatures={"custom_signature": serving_fn})
-            model_loaded = tf.keras.models.load_model(f"{tmp_dir}/saved_model/1")
-            self.assertTrue("custom_signature" in list(model_loaded.signatures.keys()))
-
-        # Providing multiple custom signature function
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(
-                tmp_dir,
-                saved_model=True,
-                signatures={"custom_signature_1": serving_fn, "custom_signature_2": serving_fn},
-            )
-            model_loaded = tf.keras.models.load_model(f"{tmp_dir}/saved_model/1")
-            self.assertTrue("custom_signature_1" in list(model_loaded.signatures.keys()))
-            self.assertTrue("custom_signature_2" in list(model_loaded.signatures.keys()))
-
-    @require_safetensors
-    def test_safetensors_save_and_load(self):
-        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, safe_serialization=True)
-            # No tf_model.h5 file, only a model.safetensors
-            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME)))
-            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, TF2_WEIGHTS_NAME)))
-
-            new_model = TFBertModel.from_pretrained(tmp_dir)
-
-            # Check models are equal
-            for p1, p2 in zip(model.weights, new_model.weights):
-                self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-    @is_pt_tf_cross_test
-    def test_safetensors_save_and_load_pt_to_tf(self):
-        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-        pt_model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            pt_model.save_pretrained(tmp_dir, safe_serialization=True)
-            # Check we have a model.safetensors file
-            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME)))
-
-            new_model = TFBertModel.from_pretrained(tmp_dir)
-
-            # Check models are equal
-            for p1, p2 in zip(model.weights, new_model.weights):
-                self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-    @require_safetensors
-    def test_safetensors_load_from_hub(self):
-        tf_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        # Can load from the TF-formatted checkpoint
-        safetensors_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert-safetensors-tf")
-
-        # Check models are equal
-        for p1, p2 in zip(safetensors_model.weights, tf_model.weights):
-            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-        # Can load from the PyTorch-formatted checkpoint
-        safetensors_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert-safetensors")
-
-        # Check models are equal
-        for p1, p2 in zip(safetensors_model.weights, tf_model.weights):
-            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
-
-
-@require_tf
-@is_staging_test
-class TFModelPushToHubTester(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls._token = TOKEN
-        HfFolder.save_token(TOKEN)
-
-    @classmethod
-    def tearDownClass(cls):
-        try:
-            delete_repo(token=cls._token, repo_id="test-model-tf")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, repo_id="test-model-tf-callback")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, repo_id="valid_org/test-model-tf-org")
-        except HTTPError:
-            pass
-
-    def test_push_to_hub(self):
-        config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-        )
-        model = TFBertModel(config)
-        # Make sure model is properly initialized
-        model.build()
-
-        logging.set_verbosity_info()
-        logger = logging.get_logger("transformers.utils.hub")
-        with CaptureLogger(logger) as cl:
-            model.push_to_hub("test-model-tf", use_auth_token=self._token)
-        logging.set_verbosity_warning()
-        # Check the model card was created and uploaded.
-        self.assertIn("Uploading the following files to __DUMMY_TRANSFORMERS_USER__/test-model-tf", cl.out)
-
-        new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf")
-        models_equal = True
-        for p1, p2 in zip(model.weights, new_model.weights):
-            if not tf.math.reduce_all(p1 == p2):
-                models_equal = False
-                break
-        self.assertTrue(models_equal)
-
-        # Reset repo
-        delete_repo(token=self._token, repo_id="test-model-tf")
-
-        # Push to hub via save_pretrained
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(tmp_dir, repo_id="test-model-tf", push_to_hub=True, use_auth_token=self._token)
-
-        new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf")
-        models_equal = True
-        for p1, p2 in zip(model.weights, new_model.weights):
-            if not tf.math.reduce_all(p1 == p2):
-                models_equal = False
-                break
-        self.assertTrue(models_equal)
-
-    @is_pt_tf_cross_test
-    def test_push_to_hub_callback(self):
-        config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-        )
-        model = TFBertForMaskedLM(config)
-        model.compile()
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            push_to_hub_callback = PushToHubCallback(
-                output_dir=tmp_dir,
-                hub_model_id="test-model-tf-callback",
-                hub_token=self._token,
-            )
-            model.fit(model.dummy_inputs, model.dummy_inputs, epochs=1, callbacks=[push_to_hub_callback])
-
-        new_model = TFBertForMaskedLM.from_pretrained(f"{USER}/test-model-tf-callback")
-        models_equal = True
-        for p1, p2 in zip(model.weights, new_model.weights):
-            if not tf.math.reduce_all(p1 == p2):
-                models_equal = False
-                break
-        self.assertTrue(models_equal)
-
-        tf_push_to_hub_params = dict(inspect.signature(TFPreTrainedModel.push_to_hub).parameters)
-        tf_push_to_hub_params.pop("base_model_card_args")
-        pt_push_to_hub_params = dict(inspect.signature(PreTrainedModel.push_to_hub).parameters)
-        pt_push_to_hub_params.pop("deprecated_kwargs")
-        self.assertDictEaual(tf_push_to_hub_params, pt_push_to_hub_params)
-
-    def test_push_to_hub_in_organization(self):
-        config = BertConfig(
-            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
-        )
-        model = TFBertModel(config)
-        # Make sure model is properly initialized
-        model.build()
-
-        model.push_to_hub("valid_org/test-model-tf-org", use_auth_token=self._token)
-
-        new_model = TFBertModel.from_pretrained("valid_org/test-model-tf-org")
-        models_equal = True
-        for p1, p2 in zip(model.weights, new_model.weights):
-            if not tf.math.reduce_all(p1 == p2):
-                models_equal = False
-                break
-        self.assertTrue(models_equal)
-
-        # Reset repo
-        delete_repo(token=self._token, repo_id="valid_org/test-model-tf-org")
-
-        # Push to hub via save_pretrained
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.save_pretrained(
-                tmp_dir, push_to_hub=True, use_auth_token=self._token, repo_id="valid_org/test-model-tf-org"
-            )
-
-        new_model = TFBertModel.from_pretrained("valid_org/test-model-tf-org")
-        models_equal = True
-        for p1, p2 in zip(model.weights, new_model.weights):
-            if not tf.math.reduce_all(p1 == p2):
-                models_equal = False
-                break
-        self.assertTrue(models_equal)
diff --git a/tests/test_modeling_tf_utils.py b/tests/test_modeling_tf_utils.py
new file mode 100644
index 0000000000..862a2cffa8
--- /dev/null
+++ b/tests/test_modeling_tf_utils.py
@@ -0,0 +1,627 @@
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from __future__ import annotations
+
+import inspect
+import json
+import os
+import random
+import tempfile
+import unittest
+import unittest.mock as mock
+
+from huggingface_hub import HfFolder, Repository, delete_repo
+from huggingface_hub.file_download import http_get
+from requests.exceptions import HTTPError
+
+from transformers import is_tf_available, is_torch_available
+from transformers.configuration_utils import PretrainedConfig
+from transformers.testing_utils import (  # noqa: F401
+    TOKEN,
+    USER,
+    CaptureLogger,
+    _tf_gpu_memory_limit,
+    is_pt_tf_cross_test,
+    is_staging_test,
+    require_safetensors,
+    require_tf,
+    slow,
+)
+from transformers.utils import SAFE_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_tf_available():
+    import h5py
+    import numpy as np
+    import tensorflow as tf
+
+    from transformers import (
+        BertConfig,
+        PreTrainedModel,
+        PushToHubCallback,
+        RagRetriever,
+        TFBertForMaskedLM,
+        TFBertForSequenceClassification,
+        TFBertModel,
+        TFPreTrainedModel,
+        TFRagModel,
+    )
+    from transformers.modeling_tf_utils import tf_shard_checkpoint, unpack_inputs
+    from transformers.tf_utils import stable_softmax
+
+    tf.config.experimental.enable_tensor_float_32_execution(False)
+
+    if _tf_gpu_memory_limit is not None:
+        gpus = tf.config.list_physical_devices("GPU")
+        for gpu in gpus:
+            # Restrict TensorFlow to only allocate x GB of memory on the GPUs
+            try:
+                tf.config.set_logical_device_configuration(
+                    gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)]
+                )
+                logical_gpus = tf.config.list_logical_devices("GPU")
+                print("Logical GPUs", logical_gpus)
+            except RuntimeError as e:
+                # Virtual devices must be set before GPUs have been initialized
+                print(e)
+
+if is_torch_available():
+    from transformers import BertModel
+
+
+@require_tf
+class TFModelUtilsTest(unittest.TestCase):
+    def test_cached_files_are_used_when_internet_is_down(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = {}
+        response_mock.raise_for_status.side_effect = HTTPError
+        response_mock.json.return_value = {}
+
+        # Download this model to make sure it's in the cache.
+        _ = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        # Under the mock environment we get a 500 error when trying to reach the model.
+        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
+            _ = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+            # This check we did call the fake head request
+            mock_head.assert_called()
+
+    def test_load_from_one_file(self):
+        try:
+            tmp_file = tempfile.mktemp()
+            with open(tmp_file, "wb") as f:
+                http_get("https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/tf_model.h5", f)
+
+            config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
+            _ = TFBertModel.from_pretrained(tmp_file, config=config)
+        finally:
+            os.remove(tmp_file)
+
+    def test_legacy_load_from_url(self):
+        # This test is for deprecated behavior and can be removed in v5
+        config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
+        _ = TFBertModel.from_pretrained(
+            "https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/tf_model.h5", config=config
+        )
+
+    # tests whether the unpack_inputs function behaves as expected
+    def test_unpack_inputs(self):
+        class DummyModel:
+            def __init__(self):
+                config_kwargs = {"output_attentions": False, "output_hidden_states": False, "return_dict": False}
+                self.config = PretrainedConfig(**config_kwargs)
+                self.main_input_name = "input_ids"
+
+            @unpack_inputs
+            def call(
+                self,
+                input_ids=None,
+                past_key_values=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+            ):
+                return input_ids, past_key_values, output_attentions, output_hidden_states, return_dict
+
+            @unpack_inputs
+            def foo(self, pixel_values, output_attentions=None, output_hidden_states=None, return_dict=None):
+                return pixel_values, output_attentions, output_hidden_states, return_dict
+
+        dummy_model = DummyModel()
+        input_ids = tf.constant([0, 1, 2, 3], dtype=tf.int32)
+        past_key_values = tf.constant([4, 5, 6, 7], dtype=tf.int32)
+        pixel_values = tf.constant([8, 9, 10, 11], dtype=tf.int32)
+
+        # test case 1: Pass inputs as keyword arguments; Booleans are inherited from the config.
+        output = dummy_model.call(input_ids=input_ids, past_key_values=past_key_values)
+        tf.debugging.assert_equal(output[0], input_ids)
+        tf.debugging.assert_equal(output[1], past_key_values)
+        self.assertFalse(output[2])
+        self.assertFalse(output[3])
+        self.assertFalse(output[4])
+
+        # test case 2: Same as above, but with positional arguments.
+        output = dummy_model.call(input_ids, past_key_values)
+        tf.debugging.assert_equal(output[0], input_ids)
+        tf.debugging.assert_equal(output[1], past_key_values)
+        self.assertFalse(output[2])
+        self.assertFalse(output[3])
+        self.assertFalse(output[4])
+
+        # test case 3: We can also pack everything in the first input.
+        output = dummy_model.call(input_ids={"input_ids": input_ids, "past_key_values": past_key_values})
+        tf.debugging.assert_equal(output[0], input_ids)
+        tf.debugging.assert_equal(output[1], past_key_values)
+        self.assertFalse(output[2])
+        self.assertFalse(output[3])
+        self.assertFalse(output[4])
+
+        # test case 4: Explicit boolean arguments should override the config.
+        output = dummy_model.call(
+            input_ids=input_ids, past_key_values=past_key_values, output_attentions=False, return_dict=True
+        )
+        tf.debugging.assert_equal(output[0], input_ids)
+        tf.debugging.assert_equal(output[1], past_key_values)
+        self.assertFalse(output[2])
+        self.assertFalse(output[3])
+        self.assertTrue(output[4])
+
+        # test case 5: Unexpected arguments should raise an exception.
+        with self.assertRaises(ValueError):
+            output = dummy_model.call(input_ids=input_ids, past_key_values=past_key_values, foo="bar")
+
+        # test case 6: the decorator is independent from `main_input_name` -- it treats the first argument of the
+        # decorated function as its main input.
+        output = dummy_model.foo(pixel_values=pixel_values)
+        tf.debugging.assert_equal(output[0], pixel_values)
+        self.assertFalse(output[1])
+        self.assertFalse(output[2])
+        self.assertFalse(output[3])
+
+    # Tests whether the stable softmax is stable on CPU, with and without XLA
+    def test_xla_stable_softmax(self):
+        large_penalty = -1e9
+        n_tokens = 10
+        batch_size = 8
+
+        def masked_softmax(x, boolean_mask):
+            numerical_mask = (1.0 - tf.cast(boolean_mask, dtype=tf.float32)) * large_penalty
+            masked_x = x + numerical_mask
+            return stable_softmax(masked_x)
+
+        xla_masked_softmax = tf.function(masked_softmax, jit_compile=True)
+        xla_stable_softmax = tf.function(stable_softmax, jit_compile=True)
+        x = tf.random.normal((batch_size, n_tokens))
+
+        # Same outcome regardless of the boolean mask here
+        masked_tokens = random.randint(0, n_tokens)
+        boolean_mask = tf.convert_to_tensor([[1] * (n_tokens - masked_tokens) + [0] * masked_tokens], dtype=tf.int32)
+
+        # We can randomly mask a random numerical input OUTSIDE XLA
+        numerical_mask = (1.0 - tf.cast(boolean_mask, dtype=tf.float32)) * large_penalty
+        masked_x = x + numerical_mask
+        xla_out = xla_stable_softmax(masked_x)
+        out = stable_softmax(masked_x)
+        assert tf.experimental.numpy.allclose(xla_out, out)
+
+        # The stable softmax has the same output as the original softmax
+        unstable_out = tf.nn.softmax(masked_x)
+        assert tf.experimental.numpy.allclose(unstable_out, out)
+
+        # We can randomly mask a random numerical input INSIDE XLA
+        xla_out = xla_masked_softmax(x, boolean_mask)
+        out = masked_softmax(x, boolean_mask)
+        assert tf.experimental.numpy.allclose(xla_out, out)
+
+    def test_checkpoint_sharding_from_hub(self):
+        model = TFBertModel.from_pretrained("ArthurZ/tiny-random-bert-sharded")
+        # the model above is the same as the model below, just a sharded version.
+        ref_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+        for p1, p2 in zip(model.weights, ref_model.weights):
+            assert np.allclose(p1.numpy(), p2.numpy())
+
+    def test_sharded_checkpoint_with_prefix(self):
+        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert", load_weight_prefix="a/b")
+        sharded_model = TFBertModel.from_pretrained("ArthurZ/tiny-random-bert-sharded", load_weight_prefix="a/b")
+        for p1, p2 in zip(model.weights, sharded_model.weights):
+            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
+            self.assertTrue(p1.name.startswith("a/b/"))
+            self.assertTrue(p2.name.startswith("a/b/"))
+
+    def test_sharded_checkpoint_transfer(self):
+        # If this doesn't throw an error then the test passes
+        TFBertForSequenceClassification.from_pretrained("ArthurZ/tiny-random-bert-sharded")
+
+    @is_pt_tf_cross_test
+    def test_checkpoint_sharding_local_from_pt(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            _ = Repository(local_dir=tmp_dir, clone_from="hf-internal-testing/tiny-random-bert-sharded")
+            model = TFBertModel.from_pretrained(tmp_dir, from_pt=True)
+            # the model above is the same as the model below, just a sharded pytorch version.
+            ref_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+            for p1, p2 in zip(model.weights, ref_model.weights):
+                assert np.allclose(p1.numpy(), p2.numpy())
+
+    @is_pt_tf_cross_test
+    def test_checkpoint_loading_with_prefix_from_pt(self):
+        model = TFBertModel.from_pretrained(
+            "hf-internal-testing/tiny-random-bert", from_pt=True, load_weight_prefix="a/b"
+        )
+        ref_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert", from_pt=True)
+        for p1, p2 in zip(model.weights, ref_model.weights):
+            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
+            self.assertTrue(p1.name.startswith("a/b/"))
+
+    @is_pt_tf_cross_test
+    def test_checkpoint_sharding_hub_from_pt(self):
+        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded", from_pt=True)
+        # the model above is the same as the model below, just a sharded pytorch version.
+        ref_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+        for p1, p2 in zip(model.weights, ref_model.weights):
+            assert np.allclose(p1.numpy(), p2.numpy())
+
+    def test_shard_checkpoint(self):
+        # This is the model we will use, total size 340,000 bytes.
+        model = tf.keras.Sequential(
+            [
+                tf.keras.layers.Dense(200, use_bias=False),  # size 80,000
+                tf.keras.layers.Dense(200, use_bias=False),  # size 160,000
+                tf.keras.layers.Dense(100, use_bias=False),  # size 80,000
+                tf.keras.layers.Dense(50, use_bias=False),  # size 20,000
+            ]
+        )
+        inputs = tf.zeros((1, 100), dtype=tf.float32)
+        model(inputs)
+        weights = model.weights
+        weights_dict = {w.name: w for w in weights}
+        with self.subTest("No shard when max size is bigger than model size"):
+            shards, index = tf_shard_checkpoint(weights)
+            self.assertIsNone(index)
+            self.assertDictEqual(shards, {TF2_WEIGHTS_NAME: weights})
+
+        with self.subTest("Test sharding, no weights bigger than max size"):
+            shards, index = tf_shard_checkpoint(weights, max_shard_size="300kB")
+            # Split is first two layers then last two.
+            self.assertDictEqual(
+                index,
+                {
+                    "metadata": {"total_size": 340000},
+                    "weight_map": {
+                        "dense/kernel:0": "tf_model-00001-of-00002.h5",
+                        "dense_1/kernel:0": "tf_model-00001-of-00002.h5",
+                        "dense_2/kernel:0": "tf_model-00002-of-00002.h5",
+                        "dense_3/kernel:0": "tf_model-00002-of-00002.h5",
+                    },
+                },
+            )
+
+            shard1 = [weights_dict["dense/kernel:0"], weights_dict["dense_1/kernel:0"]]
+            shard2 = [weights_dict["dense_2/kernel:0"], weights_dict["dense_3/kernel:0"]]
+            self.assertDictEqual(shards, {"tf_model-00001-of-00002.h5": shard1, "tf_model-00002-of-00002.h5": shard2})
+
+        with self.subTest("Test sharding with weights bigger than max size"):
+            shards, index = tf_shard_checkpoint(weights, max_shard_size="100kB")
+            # Split is first layer, second layer then last 2.
+            self.assertDictEqual(
+                index,
+                {
+                    "metadata": {"total_size": 340000},
+                    "weight_map": {
+                        "dense/kernel:0": "tf_model-00001-of-00003.h5",
+                        "dense_1/kernel:0": "tf_model-00002-of-00003.h5",
+                        "dense_2/kernel:0": "tf_model-00003-of-00003.h5",
+                        "dense_3/kernel:0": "tf_model-00003-of-00003.h5",
+                    },
+                },
+            )
+
+            shard1 = [weights_dict["dense/kernel:0"]]
+            shard2 = [weights_dict["dense_1/kernel:0"]]
+            shard3 = [weights_dict["dense_2/kernel:0"], weights_dict["dense_3/kernel:0"]]
+            self.assertDictEqual(
+                shards,
+                {
+                    "tf_model-00001-of-00003.h5": shard1,
+                    "tf_model-00002-of-00003.h5": shard2,
+                    "tf_model-00003-of-00003.h5": shard3,
+                },
+            )
+
+    @slow
+    def test_special_layer_name_sharding(self):
+        retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
+        model = TFRagModel.from_pretrained("facebook/rag-token-nq", retriever=retriever)
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            for max_size in ["150kB", "150kiB", "200kB", "200kiB"]:
+                model.save_pretrained(tmp_dir, max_shard_size=max_size)
+                ref_model = TFRagModel.from_pretrained(tmp_dir, retriever=retriever)
+                for p1, p2 in zip(model.weights, ref_model.weights):
+                    assert np.allclose(p1.numpy(), p2.numpy())
+
+    def test_checkpoint_sharding_local(self):
+        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            # We use the same folder for various sizes to make sure a new save erases the old checkpoint.
+            for max_size in ["150kB", "150kiB", "200kB", "200kiB"]:
+                model.save_pretrained(tmp_dir, max_shard_size=max_size)
+
+                # Get each shard file and its size
+                shard_to_size = {}
+                for shard in os.listdir(tmp_dir):
+                    if shard.endswith(".h5"):
+                        shard_file = os.path.join(tmp_dir, shard)
+                        shard_to_size[shard_file] = os.path.getsize(shard_file)
+
+                index_file = os.path.join(tmp_dir, TF2_WEIGHTS_INDEX_NAME)
+                # Check there is an index but no regular weight file
+                self.assertTrue(os.path.isfile(index_file))
+                self.assertFalse(os.path.isfile(os.path.join(tmp_dir, TF2_WEIGHTS_NAME)))
+
+                # Check a file is bigger than max_size only when it has a single weight
+                for shard_file, size in shard_to_size.items():
+                    if max_size.endswith("kiB"):
+                        max_size_int = int(max_size[:-3]) * 2**10
+                    else:
+                        max_size_int = int(max_size[:-2]) * 10**3
+                    # Note: pickle adds some junk so the weight of the file can end up being slightly bigger than
+                    # the size asked for (since we count parameters)
+                    if size >= max_size_int + 50000:
+                        with h5py.File(shard_file, "r") as state_file:
+                            self.assertEqual(len(state_file), 1)
+
+                # Check the index and the shard files found match
+                with open(index_file, "r", encoding="utf-8") as f:
+                    index = json.loads(f.read())
+
+                all_shards = set(index["weight_map"].values())
+                shards_found = {f for f in os.listdir(tmp_dir) if f.endswith(".h5")}
+                self.assertSetEqual(all_shards, shards_found)
+
+                # Finally, check the model can be reloaded
+                new_model = TFBertModel.from_pretrained(tmp_dir)
+
+                model.build()
+                new_model.build()
+
+                for p1, p2 in zip(model.weights, new_model.weights):
+                    self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
+
+    @slow
+    def test_save_pretrained_signatures(self):
+        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        # Short custom TF signature function.
+        # `input_signature` is specific to BERT.
+        @tf.function(
+            input_signature=[
+                [
+                    tf.TensorSpec([None, None], tf.int32, name="input_ids"),
+                    tf.TensorSpec([None, None], tf.int32, name="token_type_ids"),
+                    tf.TensorSpec([None, None], tf.int32, name="attention_mask"),
+                ]
+            ]
+        )
+        def serving_fn(input):
+            return model(input)
+
+        # Using default signature (default behavior) overrides 'serving_default'
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir, saved_model=True, signatures=None)
+            model_loaded = tf.keras.models.load_model(f"{tmp_dir}/saved_model/1")
+            self.assertTrue("serving_default" in list(model_loaded.signatures.keys()))
+
+        # Providing custom signature function
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir, saved_model=True, signatures={"custom_signature": serving_fn})
+            model_loaded = tf.keras.models.load_model(f"{tmp_dir}/saved_model/1")
+            self.assertTrue("custom_signature" in list(model_loaded.signatures.keys()))
+
+        # Providing multiple custom signature function
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(
+                tmp_dir,
+                saved_model=True,
+                signatures={"custom_signature_1": serving_fn, "custom_signature_2": serving_fn},
+            )
+            model_loaded = tf.keras.models.load_model(f"{tmp_dir}/saved_model/1")
+            self.assertTrue("custom_signature_1" in list(model_loaded.signatures.keys()))
+            self.assertTrue("custom_signature_2" in list(model_loaded.signatures.keys()))
+
+    @require_safetensors
+    def test_safetensors_save_and_load(self):
+        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir, safe_serialization=True)
+            # No tf_model.h5 file, only a model.safetensors
+            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME)))
+            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, TF2_WEIGHTS_NAME)))
+
+            new_model = TFBertModel.from_pretrained(tmp_dir)
+
+            # Check models are equal
+            for p1, p2 in zip(model.weights, new_model.weights):
+                self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
+
+    @is_pt_tf_cross_test
+    def test_safetensors_save_and_load_pt_to_tf(self):
+        model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+        pt_model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            pt_model.save_pretrained(tmp_dir, safe_serialization=True)
+            # Check we have a model.safetensors file
+            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME)))
+
+            new_model = TFBertModel.from_pretrained(tmp_dir)
+
+            # Check models are equal
+            for p1, p2 in zip(model.weights, new_model.weights):
+                self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
+
+    @require_safetensors
+    def test_safetensors_load_from_hub(self):
+        tf_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        # Can load from the TF-formatted checkpoint
+        safetensors_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert-safetensors-tf")
+
+        # Check models are equal
+        for p1, p2 in zip(safetensors_model.weights, tf_model.weights):
+            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
+
+        # Can load from the PyTorch-formatted checkpoint
+        safetensors_model = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert-safetensors")
+
+        # Check models are equal
+        for p1, p2 in zip(safetensors_model.weights, tf_model.weights):
+            self.assertTrue(np.allclose(p1.numpy(), p2.numpy()))
+
+
+@require_tf
+@is_staging_test
+class TFModelPushToHubTester(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls._token = TOKEN
+        HfFolder.save_token(TOKEN)
+
+    @classmethod
+    def tearDownClass(cls):
+        try:
+            delete_repo(token=cls._token, repo_id="test-model-tf")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(token=cls._token, repo_id="test-model-tf-callback")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(token=cls._token, repo_id="valid_org/test-model-tf-org")
+        except HTTPError:
+            pass
+
+    def test_push_to_hub(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        model = TFBertModel(config)
+        # Make sure model is properly initialized
+        model.build()
+
+        logging.set_verbosity_info()
+        logger = logging.get_logger("transformers.utils.hub")
+        with CaptureLogger(logger) as cl:
+            model.push_to_hub("test-model-tf", use_auth_token=self._token)
+        logging.set_verbosity_warning()
+        # Check the model card was created and uploaded.
+        self.assertIn("Uploading the following files to __DUMMY_TRANSFORMERS_USER__/test-model-tf", cl.out)
+
+        new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf")
+        models_equal = True
+        for p1, p2 in zip(model.weights, new_model.weights):
+            if not tf.math.reduce_all(p1 == p2):
+                models_equal = False
+                break
+        self.assertTrue(models_equal)
+
+        # Reset repo
+        delete_repo(token=self._token, repo_id="test-model-tf")
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir, repo_id="test-model-tf", push_to_hub=True, use_auth_token=self._token)
+
+        new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf")
+        models_equal = True
+        for p1, p2 in zip(model.weights, new_model.weights):
+            if not tf.math.reduce_all(p1 == p2):
+                models_equal = False
+                break
+        self.assertTrue(models_equal)
+
+    @is_pt_tf_cross_test
+    def test_push_to_hub_callback(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        model = TFBertForMaskedLM(config)
+        model.compile()
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            push_to_hub_callback = PushToHubCallback(
+                output_dir=tmp_dir,
+                hub_model_id="test-model-tf-callback",
+                hub_token=self._token,
+            )
+            model.fit(model.dummy_inputs, model.dummy_inputs, epochs=1, callbacks=[push_to_hub_callback])
+
+        new_model = TFBertForMaskedLM.from_pretrained(f"{USER}/test-model-tf-callback")
+        models_equal = True
+        for p1, p2 in zip(model.weights, new_model.weights):
+            if not tf.math.reduce_all(p1 == p2):
+                models_equal = False
+                break
+        self.assertTrue(models_equal)
+
+        tf_push_to_hub_params = dict(inspect.signature(TFPreTrainedModel.push_to_hub).parameters)
+        tf_push_to_hub_params.pop("base_model_card_args")
+        pt_push_to_hub_params = dict(inspect.signature(PreTrainedModel.push_to_hub).parameters)
+        pt_push_to_hub_params.pop("deprecated_kwargs")
+        self.assertDictEaual(tf_push_to_hub_params, pt_push_to_hub_params)
+
+    def test_push_to_hub_in_organization(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        model = TFBertModel(config)
+        # Make sure model is properly initialized
+        model.build()
+
+        model.push_to_hub("valid_org/test-model-tf-org", use_auth_token=self._token)
+
+        new_model = TFBertModel.from_pretrained("valid_org/test-model-tf-org")
+        models_equal = True
+        for p1, p2 in zip(model.weights, new_model.weights):
+            if not tf.math.reduce_all(p1 == p2):
+                models_equal = False
+                break
+        self.assertTrue(models_equal)
+
+        # Reset repo
+        delete_repo(token=self._token, repo_id="valid_org/test-model-tf-org")
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(
+                tmp_dir, push_to_hub=True, use_auth_token=self._token, repo_id="valid_org/test-model-tf-org"
+            )
+
+        new_model = TFBertModel.from_pretrained("valid_org/test-model-tf-org")
+        models_equal = True
+        for p1, p2 in zip(model.weights, new_model.weights):
+            if not tf.math.reduce_all(p1 == p2):
+                models_equal = False
+                break
+        self.assertTrue(models_equal)
diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py
new file mode 100755
index 0000000000..760d6c6d54
--- /dev/null
+++ b/tests/test_modeling_utils.py
@@ -0,0 +1,976 @@
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import json
+import os
+import os.path
+import sys
+import tempfile
+import unittest
+import unittest.mock as mock
+from pathlib import Path
+
+from huggingface_hub import HfFolder, delete_repo
+from huggingface_hub.file_download import http_get
+from pytest import mark
+from requests.exceptions import HTTPError
+
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    PretrainedConfig,
+    is_torch_available,
+    logging,
+)
+from transformers.testing_utils import (
+    TOKEN,
+    USER,
+    CaptureLogger,
+    TestCasePlus,
+    is_staging_test,
+    require_accelerate,
+    require_safetensors,
+    require_torch,
+    require_torch_gpu,
+    require_torch_multi_gpu,
+    require_usr_bin_time,
+    slow,
+)
+from transformers.utils import (
+    SAFE_WEIGHTS_INDEX_NAME,
+    SAFE_WEIGHTS_NAME,
+    WEIGHTS_INDEX_NAME,
+    WEIGHTS_NAME,
+)
+
+
+sys.path.append(str(Path(__file__).parent.parent / "utils"))
+
+from test_module.custom_configuration import CustomConfig, NoSuperInitConfig  # noqa E402
+
+
+if is_torch_available():
+    import torch
+    from test_module.custom_modeling import CustomModel, NoSuperInitModel
+    from torch import nn
+
+    from transformers import (
+        BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        AutoModelForCausalLM,
+        AutoTokenizer,
+        BertConfig,
+        BertModel,
+        CLIPTextModel,
+        PreTrainedModel,
+        T5Config,
+        T5ForConditionalGeneration,
+    )
+    from transformers.modeling_utils import shard_checkpoint
+
+    # Fake pretrained models for tests
+    class BaseModel(PreTrainedModel):
+        config_class = PretrainedConfig
+
+        def __init__(self, config):
+            super().__init__(config)
+            self.linear = nn.Linear(4, 5)
+            self.linear_2 = nn.Linear(5, 6)
+
+        def forward(self, x):
+            return self.linear_2(self.linear(x))
+
+    class ModelWithHead(PreTrainedModel):
+        base_model_prefix = "base"
+        config_class = PretrainedConfig
+
+        def _init_weights(self, module):
+            pass
+
+        def __init__(self, config):
+            super().__init__(config)
+            self.base = BaseModel(config)
+            # linear is a common name between Base and Head on purpose.
+            self.linear = nn.Linear(6, 3)
+            self.linear2 = nn.Linear(3, 5)
+
+        def forward(self, x):
+            return self.linear2(self.linear(self.base(x)))
+
+
+TINY_T5 = "patrickvonplaten/t5-tiny-random"
+TINY_BERT_FOR_TOKEN_CLASSIFICATION = "hf-internal-testing/tiny-bert-for-token-classification"
+
+
+def check_models_equal(model1, model2):
+    models_are_equal = True
+    for model1_p, model2_p in zip(model1.parameters(), model2.parameters()):
+        if model1_p.data.ne(model2_p.data).sum() > 0:
+            models_are_equal = False
+
+    return models_are_equal
+
+
+@require_torch
+class ModelUtilsTest(TestCasePlus):
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            config = BertConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, PretrainedConfig)
+
+            model = BertModel.from_pretrained(model_name)
+            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, PreTrainedModel)
+
+            self.assertEqual(len(loading_info["missing_keys"]), 0)
+            self.assertEqual(len(loading_info["unexpected_keys"]), 8)
+            self.assertEqual(len(loading_info["mismatched_keys"]), 0)
+            self.assertEqual(len(loading_info["error_msgs"]), 0)
+
+            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+
+            # Not sure this is the intended behavior. TODO fix Lysandre & Thom
+            config.name_or_path = model_name
+
+            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+            self.assertEqual(model.config.output_hidden_states, True)
+            self.assertEqual(model.config, config)
+
+    def test_model_from_pretrained_subfolder(self):
+        config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
+        model = BertModel(config)
+
+        subfolder = "bert"
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(os.path.join(tmp_dir, subfolder))
+
+            with self.assertRaises(OSError):
+                _ = BertModel.from_pretrained(tmp_dir)
+
+            model_loaded = BertModel.from_pretrained(tmp_dir, subfolder=subfolder)
+
+        self.assertTrue(check_models_equal(model, model_loaded))
+
+    def test_model_from_pretrained_subfolder_sharded(self):
+        config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
+        model = BertModel(config)
+
+        subfolder = "bert"
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(os.path.join(tmp_dir, subfolder), max_shard_size="10KB")
+
+            with self.assertRaises(OSError):
+                _ = BertModel.from_pretrained(tmp_dir)
+
+            model_loaded = BertModel.from_pretrained(tmp_dir, subfolder=subfolder)
+
+        self.assertTrue(check_models_equal(model, model_loaded))
+
+    def test_model_from_pretrained_hub_subfolder(self):
+        subfolder = "bert"
+        model_id = "hf-internal-testing/tiny-random-bert-subfolder"
+        with self.assertRaises(OSError):
+            _ = BertModel.from_pretrained(model_id)
+
+        model = BertModel.from_pretrained(model_id, subfolder=subfolder)
+
+        self.assertIsNotNone(model)
+
+    def test_model_from_pretrained_hub_subfolder_sharded(self):
+        subfolder = "bert"
+        model_id = "hf-internal-testing/tiny-random-bert-sharded-subfolder"
+        with self.assertRaises(OSError):
+            _ = BertModel.from_pretrained(model_id)
+
+        model = BertModel.from_pretrained(model_id, subfolder=subfolder)
+
+        self.assertIsNotNone(model)
+
+    def test_model_from_pretrained_with_different_pretrained_model_name(self):
+        model = T5ForConditionalGeneration.from_pretrained(TINY_T5)
+        self.assertIsNotNone(model)
+
+        logger = logging.get_logger("transformers.configuration_utils")
+        with CaptureLogger(logger) as cl:
+            BertModel.from_pretrained(TINY_T5)
+        self.assertTrue("You are using a model of type t5 to instantiate a model of type bert" in cl.out)
+
+    def test_model_from_config_torch_dtype(self):
+        # test that the model can be instantiated with dtype of user's choice - as long as it's a
+        # float dtype. To make it happen config.torch_dtype needs to be set before instantiating the
+        # model from the config object.
+
+        config = T5Config.from_pretrained(TINY_T5)
+        model = AutoModel.from_config(config)
+        # XXX: isn't supported
+        # model = T5ForConditionalGeneration.from_config(config)
+        self.assertEqual(model.dtype, torch.float32)
+
+        model = AutoModel.from_config(config, torch_dtype=torch.float16)
+        self.assertEqual(model.dtype, torch.float16)
+
+        # torch.set_default_dtype() supports only float dtypes, so will fail with non-float type
+        with self.assertRaises(ValueError):
+            model = AutoModel.from_config(config, torch_dtype=torch.int64)
+
+    def test_model_from_pretrained_torch_dtype(self):
+        # test that the model can be instantiated with dtype of either
+        # 1. explicit from_pretrained's torch_dtype argument
+        # 2. via autodiscovery by looking at model weights (torch_dtype="auto")
+        # so if a model.half() was saved, we want it to be instantiated as such.
+        #
+        # test an explicit model class, but also AutoModel separately as the latter goes through a different code path
+        model_path = self.get_auto_remove_tmp_dir()
+
+        # baseline - we know TINY_T5 is fp32 model
+        model = T5ForConditionalGeneration.from_pretrained(TINY_T5)
+        self.assertEqual(model.dtype, torch.float32)
+
+        def remove_torch_dtype(model_path):
+            file = f"{model_path}/config.json"
+            with open(file, "r", encoding="utf-8") as f:
+                s = json.load(f)
+            s.pop("torch_dtype")
+            with open(file, "w", encoding="utf-8") as f:
+                json.dump(s, f)
+
+        # test the default fp32 save_pretrained => from_pretrained cycle
+        model.save_pretrained(model_path)
+        model = T5ForConditionalGeneration.from_pretrained(model_path)
+        self.assertEqual(model.dtype, torch.float32)
+        # 1. test torch_dtype="auto" via `config.torch_dtype`
+        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto")
+        self.assertEqual(model.dtype, torch.float32)
+        # 2. test torch_dtype="auto" via auto-derivation
+        # now remove the torch_dtype entry from config.json and try "auto" again which should
+        # perform auto-derivation from weights
+        remove_torch_dtype(model_path)
+        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto")
+        self.assertEqual(model.dtype, torch.float32)
+
+        # test forced loading in fp16 (even though the weights are in fp32)
+        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
+        self.assertEqual(model.dtype, torch.float16)
+
+        # test fp16 save_pretrained, loaded with auto-detection
+        model = model.half()
+        model.save_pretrained(model_path)
+        # 1. test torch_dtype="auto" via `config.torch_dtype`
+        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto")
+        self.assertEqual(model.config.torch_dtype, torch.float16)
+        self.assertEqual(model.dtype, torch.float16)
+        # tests `config.torch_dtype` saving
+        with open(f"{model_path}/config.json") as f:
+            config_dict = json.load(f)
+        self.assertEqual(config_dict["torch_dtype"], "float16")
+        # 2. test torch_dtype="auto" via auto-derivation
+        # now same with using config info
+        remove_torch_dtype(model_path)
+        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto")
+        self.assertEqual(model.dtype, torch.float16)
+
+        # 3. now retest that AutoModel behaves the same wrt torch_dtype="auto" as T5ForConditionalGeneration
+        model = AutoModel.from_pretrained(model_path, torch_dtype="auto")
+        self.assertEqual(model.dtype, torch.float16)
+
+        # test fp16 save_pretrained, loaded with the explicit fp16
+        model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
+        self.assertEqual(model.dtype, torch.float16)
+
+        # test AutoModel separately as it goes through a different path
+        # test auto-detection - as currently TINY_T5 doesn't have torch_dtype entry
+        model = AutoModel.from_pretrained(TINY_T5, torch_dtype="auto")
+        # test that the config object didn't get polluted with torch_dtype="auto"
+        # there was a bug that after this call we ended up with config.torch_dtype=="auto"
+        self.assertNotEqual(model.config.torch_dtype, "auto")
+        # now test the outcome
+        self.assertEqual(model.dtype, torch.float32)
+        model = AutoModel.from_pretrained(TINY_T5, torch_dtype=torch.float16)
+        self.assertEqual(model.dtype, torch.float16)
+
+        # test model whose first param is not of a floating type, but int
+        model = AutoModel.from_pretrained(TINY_BERT_FOR_TOKEN_CLASSIFICATION, torch_dtype="auto")
+        self.assertEqual(model.dtype, torch.float32)
+
+    def test_no_super_init_config_and_model(self):
+        config = NoSuperInitConfig(attribute=32)
+        model = NoSuperInitModel(config)
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir)
+
+            new_model = NoSuperInitModel.from_pretrained(tmp_dir)
+
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+    def test_shard_checkpoint(self):
+        # This is the model we will use, total size 340,000 bytes.
+        model = torch.nn.Sequential(
+            torch.nn.Linear(100, 200, bias=False),  # size 80,000
+            torch.nn.Linear(200, 200, bias=False),  # size 160,000
+            torch.nn.Linear(200, 100, bias=False),  # size 80,000
+            torch.nn.Linear(100, 50, bias=False),  # size 20,000
+        )
+        state_dict = model.state_dict()
+
+        with self.subTest("No shard when max size is bigger than model size"):
+            shards, index = shard_checkpoint(state_dict)
+            self.assertIsNone(index)
+            self.assertDictEqual(shards, {WEIGHTS_NAME: state_dict})
+
+        with self.subTest("Test sharding, no weights bigger than max size"):
+            shards, index = shard_checkpoint(state_dict, max_shard_size="300kB")
+            # Split is first two layers then last two.
+            self.assertDictEqual(
+                index,
+                {
+                    "metadata": {"total_size": 340000},
+                    "weight_map": {
+                        "0.weight": "pytorch_model-00001-of-00002.bin",
+                        "1.weight": "pytorch_model-00001-of-00002.bin",
+                        "2.weight": "pytorch_model-00002-of-00002.bin",
+                        "3.weight": "pytorch_model-00002-of-00002.bin",
+                    },
+                },
+            )
+
+            shard1 = {"0.weight": state_dict["0.weight"], "1.weight": state_dict["1.weight"]}
+            shard2 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]}
+            self.assertDictEqual(
+                shards, {"pytorch_model-00001-of-00002.bin": shard1, "pytorch_model-00002-of-00002.bin": shard2}
+            )
+
+        with self.subTest("Test sharding with weights bigger than max size"):
+            shards, index = shard_checkpoint(state_dict, max_shard_size="100kB")
+            # Split is first layer, second layer then last 2.
+            self.assertDictEqual(
+                index,
+                {
+                    "metadata": {"total_size": 340000},
+                    "weight_map": {
+                        "0.weight": "pytorch_model-00001-of-00003.bin",
+                        "1.weight": "pytorch_model-00002-of-00003.bin",
+                        "2.weight": "pytorch_model-00003-of-00003.bin",
+                        "3.weight": "pytorch_model-00003-of-00003.bin",
+                    },
+                },
+            )
+
+            shard1 = {"0.weight": state_dict["0.weight"]}
+            shard2 = {"1.weight": state_dict["1.weight"]}
+            shard3 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]}
+            self.assertDictEqual(
+                shards,
+                {
+                    "pytorch_model-00001-of-00003.bin": shard1,
+                    "pytorch_model-00002-of-00003.bin": shard2,
+                    "pytorch_model-00003-of-00003.bin": shard3,
+                },
+            )
+
+    def test_checkpoint_sharding_local(self):
+        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            # We use the same folder for various sizes to make sure a new save erases the old checkpoint.
+            for max_size in ["50kB", "50kiB", "100kB", "100kiB", "200kB", "200kiB"]:
+                model.save_pretrained(tmp_dir, max_shard_size=max_size)
+
+                # Get each shard file and its size
+                shard_to_size = {}
+                for shard in os.listdir(tmp_dir):
+                    if shard.endswith(".bin"):
+                        shard_file = os.path.join(tmp_dir, shard)
+                        shard_to_size[shard_file] = os.path.getsize(shard_file)
+
+                index_file = os.path.join(tmp_dir, WEIGHTS_INDEX_NAME)
+                # Check there is an index but no regular weight file
+                self.assertTrue(os.path.isfile(index_file))
+                self.assertFalse(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_NAME)))
+
+                # Check a file is bigger than max_size only when it has a single weight
+                for shard_file, size in shard_to_size.items():
+                    if max_size.endswith("kiB"):
+                        max_size_int = int(max_size[:-3]) * 2**10
+                    else:
+                        max_size_int = int(max_size[:-2]) * 10**3
+                    # Note: pickle adds some junk so the weight of the file can end up being slightly bigger than
+                    # the size asked for (since we count parameters)
+                    if size >= max_size_int + 50000:
+                        state_dict = torch.load(shard_file)
+                        self.assertEqual(len(state_dict), 1)
+
+                # Check the index and the shard files found match
+                with open(index_file, "r", encoding="utf-8") as f:
+                    index = json.loads(f.read())
+
+                all_shards = set(index["weight_map"].values())
+                shards_found = {f for f in os.listdir(tmp_dir) if f.endswith(".bin")}
+                self.assertSetEqual(all_shards, shards_found)
+
+                # Finally, check the model can be reloaded
+                new_model = BertModel.from_pretrained(tmp_dir)
+                for p1, p2 in zip(model.parameters(), new_model.parameters()):
+                    self.assertTrue(torch.allclose(p1, p2))
+
+    def test_checkpoint_sharding_from_hub(self):
+        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded")
+        # the model above is the same as the model below, just a sharded version.
+        ref_model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+        for p1, p2 in zip(model.parameters(), ref_model.parameters()):
+            self.assertTrue(torch.allclose(p1, p2))
+
+    def test_checkpoint_variant_local(self):
+        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir, variant="v2")
+
+            weights_name = ".".join(WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["bin"])
+
+            weights_file = os.path.join(tmp_dir, weights_name)
+            self.assertTrue(os.path.isfile(weights_file))
+            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_NAME)))
+
+            with self.assertRaises(EnvironmentError):
+                _ = BertModel.from_pretrained(tmp_dir)
+
+            new_model = BertModel.from_pretrained(tmp_dir, variant="v2")
+
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.allclose(p1, p2))
+
+    def test_checkpoint_variant_local_sharded(self):
+        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir, variant="v2", max_shard_size="50kB")
+
+            weights_index_name = ".".join(WEIGHTS_INDEX_NAME.split(".")[:-1] + ["v2"] + ["json"])
+            weights_index_file = os.path.join(tmp_dir, weights_index_name)
+            self.assertTrue(os.path.isfile(weights_index_file))
+            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_INDEX_NAME)))
+
+            for i in range(1, 6):
+                weights_name = ".".join(WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00006"] + ["bin"])
+                weights_name_file = os.path.join(tmp_dir, weights_name)
+                self.assertTrue(os.path.isfile(weights_name_file))
+
+            with self.assertRaises(EnvironmentError):
+                _ = BertModel.from_pretrained(tmp_dir)
+
+            new_model = BertModel.from_pretrained(tmp_dir, variant="v2")
+
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.allclose(p1, p2))
+
+    @require_safetensors
+    def test_checkpoint_variant_local_safe(self):
+        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir, variant="v2", safe_serialization=True)
+
+            weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["safetensors"])
+
+            weights_file = os.path.join(tmp_dir, weights_name)
+            self.assertTrue(os.path.isfile(weights_file))
+            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME)))
+
+            with self.assertRaises(EnvironmentError):
+                _ = BertModel.from_pretrained(tmp_dir)
+
+            new_model = BertModel.from_pretrained(tmp_dir, variant="v2")
+
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.allclose(p1, p2))
+
+    @require_safetensors
+    def test_checkpoint_variant_local_sharded_safe(self):
+        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir, variant="v2", max_shard_size="50kB", safe_serialization=True)
+
+            weights_index_name = ".".join(SAFE_WEIGHTS_INDEX_NAME.split(".")[:-1] + ["v2"] + ["json"])
+            weights_index_file = os.path.join(tmp_dir, weights_index_name)
+            self.assertTrue(os.path.isfile(weights_index_file))
+            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
+
+            for i in range(1, 6):
+                weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00006"] + ["safetensors"])
+                weights_name_file = os.path.join(tmp_dir, weights_name)
+                self.assertTrue(os.path.isfile(weights_name_file))
+
+            with self.assertRaises(EnvironmentError):
+                _ = BertModel.from_pretrained(tmp_dir)
+
+            new_model = BertModel.from_pretrained(tmp_dir, variant="v2")
+
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.allclose(p1, p2))
+
+    def test_checkpoint_variant_hub(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with self.assertRaises(EnvironmentError):
+                _ = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert-variant", cache_dir=tmp_dir)
+            model = BertModel.from_pretrained(
+                "hf-internal-testing/tiny-random-bert-variant", cache_dir=tmp_dir, variant="v2"
+            )
+        self.assertIsNotNone(model)
+
+    def test_checkpoint_variant_hub_sharded(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with self.assertRaises(EnvironmentError):
+                _ = BertModel.from_pretrained(
+                    "hf-internal-testing/tiny-random-bert-variant-sharded", cache_dir=tmp_dir
+                )
+            model = BertModel.from_pretrained(
+                "hf-internal-testing/tiny-random-bert-variant-sharded", cache_dir=tmp_dir, variant="v2"
+            )
+        self.assertIsNotNone(model)
+
+    @require_safetensors
+    def test_checkpoint_variant_hub_safe(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with self.assertRaises(EnvironmentError):
+                _ = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert-variant-safe", cache_dir=tmp_dir)
+            model = BertModel.from_pretrained(
+                "hf-internal-testing/tiny-random-bert-variant-safe", cache_dir=tmp_dir, variant="v2"
+            )
+        self.assertIsNotNone(model)
+
+    @require_safetensors
+    def test_checkpoint_variant_hub_sharded_safe(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with self.assertRaises(EnvironmentError):
+                _ = BertModel.from_pretrained(
+                    "hf-internal-testing/tiny-random-bert-variant-sharded-safe", cache_dir=tmp_dir
+                )
+            model = BertModel.from_pretrained(
+                "hf-internal-testing/tiny-random-bert-variant-sharded-safe", cache_dir=tmp_dir, variant="v2"
+            )
+        self.assertIsNotNone(model)
+
+    def test_checkpoint_variant_save_load(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model = BertModel.from_pretrained(
+                "hf-internal-testing/tiny-random-bert-variant", cache_dir=tmp_dir, variant="v2"
+            )
+            weights_name = ".".join(WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["bin"])
+
+            model.save_pretrained(tmp_dir, variant="v2")
+            # saving will create a variant checkpoint
+            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, weights_name)))
+
+            model.save_pretrained(tmp_dir)
+            # saving shouldn't delete variant checkpoints
+            weights_name = ".".join(WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["bin"])
+            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, weights_name)))
+
+            # there should be a normal checkpoint
+            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_NAME)))
+
+        self.assertIsNotNone(model)
+
+    @require_accelerate
+    @mark.accelerate_tests
+    def test_from_pretrained_low_cpu_mem_usage_functional(self):
+        # test that we can use `from_pretrained(..., low_cpu_mem_usage=True)` with normal and
+        # sharded models
+
+        mnames = [
+            "hf-internal-testing/tiny-random-bert-sharded",
+            "hf-internal-testing/tiny-random-bert",
+        ]
+        for mname in mnames:
+            _ = BertModel.from_pretrained(mname, low_cpu_mem_usage=True)
+
+    @require_usr_bin_time
+    @require_accelerate
+    @mark.accelerate_tests
+    def test_from_pretrained_low_cpu_mem_usage_measured(self):
+        # test that `from_pretrained(..., low_cpu_mem_usage=True)` uses less cpu memory than default
+
+        mname = "bert-base-cased"
+
+        preamble = "from transformers import AutoModel"
+        one_liner_str = f'{preamble}; AutoModel.from_pretrained("{mname}", low_cpu_mem_usage=False)'
+        max_rss_normal = self.python_one_liner_max_rss(one_liner_str)
+        # print(f"{max_rss_normal=}")
+
+        one_liner_str = f'{preamble};  AutoModel.from_pretrained("{mname}", low_cpu_mem_usage=True)'
+        max_rss_low_mem = self.python_one_liner_max_rss(one_liner_str)
+        # print(f"{max_rss_low_mem=}")
+
+        diff_bytes = max_rss_normal - max_rss_low_mem
+        diff_percent = diff_bytes / max_rss_low_mem
+        # print(f"{diff_bytes=}, {diff_percent=}")
+        # ideally we would compare that the diff is close to ~1x checkpoint size in bytes, but
+        # measuring cpu memory on linux is very tricky and inconsistent, so instead let's check that
+        # it's at least 15% less cpu memory consumed
+
+        self.assertGreater(
+            diff_percent,
+            0.15,
+            "should use less CPU memory for low_cpu_mem_usage=True, "
+            f"but got max_rss_normal={max_rss_normal} and max_rss_low_mem={max_rss_low_mem}",
+        )
+
+        # if you want to compare things manually, let's first look at the size of the model in bytes
+        # model = BertModel.from_pretrained(mname, low_cpu_mem_usage=False)
+        # total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
+        # total_bytes = total_numel * 4  # 420MB
+        # Now the diff_bytes should be very close to total_bytes, but the reports are inconsistent.
+        # The easiest way to test this is to switch the model and torch.load to do all the work on
+        # gpu - that way one can measure exactly the total and peak memory used. Perhaps once we add
+        # functionality to load models directly on gpu, this test can be rewritten to use torch's
+        # cuda memory tracking and then we should be able to do a much more precise test.
+
+    @require_accelerate
+    @mark.accelerate_tests
+    @require_torch_multi_gpu
+    @slow
+    def test_model_parallelism_gpt2(self):
+        device_map = {"transformer.wte": 0, "transformer.wpe": 0, "lm_head": 0, "transformer.ln_f": 1}
+        for i in range(12):
+            device_map[f"transformer.h.{i}"] = 0 if i <= 5 else 1
+
+        model = AutoModelForCausalLM.from_pretrained("gpt2", device_map=device_map)
+
+        tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        inputs = tokenizer("Hello, my name is", return_tensors="pt")
+        output = model.generate(inputs["input_ids"].to(0))
+
+        text_output = tokenizer.decode(output[0].tolist())
+        self.assertEqual(text_output, "Hello, my name is John. I'm a writer, and I'm a writer. I'm")
+
+    @require_accelerate
+    @mark.accelerate_tests
+    @require_torch_gpu
+    def test_from_pretrained_disk_offload_task_model(self):
+        model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+        device_map = {
+            "transformer.wte": 0,
+            "transformer.wpe": 0,
+            "transformer.h.0": "cpu",
+            "transformer.h.1": "cpu",
+            "transformer.h.2": "cpu",
+            "transformer.h.3": "disk",
+            "transformer.h.4": "disk",
+            "transformer.ln_f": 0,
+            "lm_head": 0,
+        }
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            inputs = torch.tensor([[1, 2, 3]]).to(0)
+
+            model.save_pretrained(tmp_dir)
+            new_model = AutoModelForCausalLM.from_pretrained(tmp_dir).to(0)
+            outputs1 = new_model.to(0)(inputs)
+
+            offload_folder = os.path.join(tmp_dir, "offload")
+            new_model_with_offload = AutoModelForCausalLM.from_pretrained(
+                tmp_dir, device_map=device_map, offload_folder=offload_folder
+            )
+            outputs2 = new_model_with_offload(inputs)
+
+            self.assertTrue(torch.allclose(outputs1.logits.cpu(), outputs2.logits.cpu()))
+
+            # With state dict temp offload
+            offload_folder = os.path.join(tmp_dir, "offload")
+            new_model_with_offload = AutoModelForCausalLM.from_pretrained(
+                tmp_dir,
+                device_map=device_map,
+                offload_folder=offload_folder,
+                offload_state_dict=True,
+            )
+            outputs2 = new_model_with_offload(inputs)
+
+            self.assertTrue(torch.allclose(outputs1.logits.cpu(), outputs2.logits.cpu()))
+
+    def test_cached_files_are_used_when_internet_is_down(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = {}
+        response_mock.raise_for_status.side_effect = HTTPError
+        response_mock.json.return_value = {}
+
+        # Download this model to make sure it's in the cache.
+        _ = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        # Under the mock environment we get a 500 error when trying to reach the model.
+        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
+            _ = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+            # This check we did call the fake head request
+            mock_head.assert_called()
+
+    def test_load_from_one_file(self):
+        try:
+            tmp_file = tempfile.mktemp()
+            with open(tmp_file, "wb") as f:
+                http_get(
+                    "https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/pytorch_model.bin", f
+                )
+
+            config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
+            _ = BertModel.from_pretrained(tmp_file, config=config)
+        finally:
+            os.remove(tmp_file)
+
+    def test_legacy_load_from_url(self):
+        # This test is for deprecated behavior and can be removed in v5
+        config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
+        _ = BertModel.from_pretrained(
+            "https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/pytorch_model.bin", config=config
+        )
+
+    @require_safetensors
+    def test_use_safetensors(self):
+        # test nice error message if no safetensor files available
+        with self.assertRaises(OSError) as env_error:
+            AutoModel.from_pretrained("hf-internal-testing/tiny-random-RobertaModel", use_safetensors=True)
+
+        self.assertTrue(
+            "model.safetensors or model.safetensors.index.json and thus cannot be loaded with `safetensors`"
+            in str(env_error.exception)
+        )
+
+        # test that error if only safetensors is available
+        with self.assertRaises(OSError) as env_error:
+            BertModel.from_pretrained("hf-internal-testing/tiny-random-bert-safetensors", use_safetensors=False)
+
+        self.assertTrue("does not appear to have a file named pytorch_model.bin" in str(env_error.exception))
+
+        # test that only safetensors if both available and use_safetensors=False
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            CLIPTextModel.from_pretrained(
+                "hf-internal-testing/diffusers-stable-diffusion-tiny-all",
+                subfolder="text_encoder",
+                use_safetensors=False,
+                cache_dir=tmp_dir,
+            )
+
+            all_downloaded_files = glob.glob(os.path.join(tmp_dir, "*", "snapshots", "*", "*", "*"))
+            self.assertTrue(any(f.endswith("bin") for f in all_downloaded_files))
+            self.assertFalse(any(f.endswith("safetensors") for f in all_downloaded_files))
+
+        # test that no safetensors if both available and use_safetensors=True
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            CLIPTextModel.from_pretrained(
+                "hf-internal-testing/diffusers-stable-diffusion-tiny-all",
+                subfolder="text_encoder",
+                use_safetensors=True,
+                cache_dir=tmp_dir,
+            )
+
+            all_downloaded_files = glob.glob(os.path.join(tmp_dir, "*", "snapshots", "*", "*", "*"))
+            self.assertTrue(any(f.endswith("safetensors") for f in all_downloaded_files))
+            self.assertFalse(any(f.endswith("bin") for f in all_downloaded_files))
+
+    @require_safetensors
+    def test_safetensors_save_and_load(self):
+        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir, safe_serialization=True)
+            # No pytorch_model.bin file, only a model.safetensors
+            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME)))
+            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_NAME)))
+
+            new_model = BertModel.from_pretrained(tmp_dir)
+
+            # Check models are equal
+            for p1, p2 in zip(model.parameters(), new_model.parameters()):
+                self.assertTrue(torch.allclose(p1, p2))
+
+    @require_safetensors
+    def test_safetensors_load_from_hub(self):
+        safetensors_model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert-safetensors")
+        pytorch_model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        # Check models are equal
+        for p1, p2 in zip(safetensors_model.parameters(), pytorch_model.parameters()):
+            self.assertTrue(torch.allclose(p1, p2))
+
+    @require_safetensors
+    def test_safetensors_save_and_load_sharded(self):
+        model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir, safe_serialization=True, max_shard_size="100kB")
+            # No pytorch_model.bin index file, only a model.safetensors index
+            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_INDEX_NAME)))
+            self.assertTrue(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
+            # No regular weights file
+            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_NAME)))
+            self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME)))
+
+            new_model = BertModel.from_pretrained(tmp_dir)
+
+            # Check models are equal
+            for p1, p2 in zip(model.parameters(), new_model.parameters()):
+                self.assertTrue(torch.allclose(p1, p2))
+
+    @require_safetensors
+    def test_safetensors_load_from_hub_sharded(self):
+        safetensors_model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded-safetensors")
+        pytorch_model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded")
+
+        # Check models are equal
+        for p1, p2 in zip(safetensors_model.parameters(), pytorch_model.parameters()):
+            self.assertTrue(torch.allclose(p1, p2))
+
+    def test_base_model_to_head_model_load(self):
+        base_model = BaseModel(PretrainedConfig())
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            base_model.save_pretrained(tmp_dir)
+
+            # Can load a base model in a model with head
+            model = ModelWithHead.from_pretrained(tmp_dir)
+            for p1, p2 in zip(model.base.parameters(), base_model.parameters()):
+                self.assertTrue(torch.allclose(p1, p2))
+
+            # It doesn't work if the state dict has a mix of keys of the head and base without prefix though.
+            base_state_dict = base_model.state_dict()
+            head_state_dict = model.state_dict()
+            base_state_dict["linear2.weight"] = head_state_dict["linear2.weight"]
+            base_state_dict["linear2.bias"] = head_state_dict["linear2.bias"]
+            torch.save(base_state_dict, os.path.join(tmp_dir, WEIGHTS_NAME))
+
+            with self.assertRaisesRegex(
+                ValueError, "The state dictionary of the model you are trying to load is corrupted."
+            ):
+                _ = ModelWithHead.from_pretrained(tmp_dir)
+
+    @require_torch_gpu
+    @slow
+    def test_pretrained_low_mem_new_config(self):
+        # Checking for 1 model(the same one which was described in the issue) .
+        model_ids = ["gpt2"]
+
+        for model_id in model_ids:
+            model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_id)
+            model_config.n_layer = 48
+            model_config.n_head = 25
+            model_config.n_embd = 1600
+            model = AutoModelForCausalLM.from_pretrained(
+                pretrained_model_name_or_path=model_id,
+                config=model_config,
+                ignore_mismatched_sizes=True,
+                torch_dtype=torch.float16,
+                low_cpu_mem_usage=True,
+            )
+            model_ref = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id)
+
+            self.assertEqual(model.__class__.__name__, model_ref.__class__.__name__)
+
+
+@require_torch
+@is_staging_test
+class ModelPushToHubTester(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls._token = TOKEN
+        HfFolder.save_token(TOKEN)
+
+    @classmethod
+    def tearDownClass(cls):
+        try:
+            delete_repo(token=cls._token, repo_id="test-model")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(token=cls._token, repo_id="valid_org/test-model-org")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(token=cls._token, repo_id="test-dynamic-model")
+        except HTTPError:
+            pass
+
+    def test_push_to_hub(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        model = BertModel(config)
+        model.push_to_hub("test-model", use_auth_token=self._token)
+
+        new_model = BertModel.from_pretrained(f"{USER}/test-model")
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+        # Reset repo
+        delete_repo(token=self._token, repo_id="test-model")
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir, repo_id="test-model", push_to_hub=True, use_auth_token=self._token)
+
+        new_model = BertModel.from_pretrained(f"{USER}/test-model")
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+    def test_push_to_hub_in_organization(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        model = BertModel(config)
+        model.push_to_hub("valid_org/test-model-org", use_auth_token=self._token)
+
+        new_model = BertModel.from_pretrained("valid_org/test-model-org")
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+        # Reset repo
+        delete_repo(token=self._token, repo_id="valid_org/test-model-org")
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(
+                tmp_dir, push_to_hub=True, use_auth_token=self._token, repo_id="valid_org/test-model-org"
+            )
+
+        new_model = BertModel.from_pretrained("valid_org/test-model-org")
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+    def test_push_to_hub_dynamic_model(self):
+        CustomConfig.register_for_auto_class()
+        CustomModel.register_for_auto_class()
+
+        config = CustomConfig(hidden_size=32)
+        model = CustomModel(config)
+
+        model.push_to_hub("test-dynamic-model", use_auth_token=self._token)
+        # checks
+        self.assertDictEqual(
+            config.auto_map,
+            {"AutoConfig": "custom_configuration.CustomConfig", "AutoModel": "custom_modeling.CustomModel"},
+        )
+
+        new_model = AutoModel.from_pretrained(f"{USER}/test-dynamic-model", trust_remote_code=True)
+        # Can't make an isinstance check because the new_model is from the CustomModel class of a dynamic module
+        self.assertEqual(new_model.__class__.__name__, "CustomModel")
+        for p1, p2 in zip(model.parameters(), new_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+        config = AutoConfig.from_pretrained(f"{USER}/test-dynamic-model", trust_remote_code=True)
+        new_model = AutoModel.from_config(config, trust_remote_code=True)
+        self.assertEqual(new_model.__class__.__name__, "CustomModel")
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 53231998ee..0656e382a8 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -21,28 +21,20 @@ import os
 import pickle
 import re
 import shutil
-import sys
 import tempfile
 import traceback
 import unittest
-import unittest.mock as mock
 from collections import OrderedDict
 from itertools import takewhile
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
 
-from huggingface_hub import HfFolder, delete_repo
-from huggingface_hub.file_download import http_get
 from parameterized import parameterized
-from requests.exceptions import HTTPError
 
 from transformers import (
     AlbertTokenizer,
     AlbertTokenizerFast,
-    AutoTokenizer,
     BertTokenizer,
     BertTokenizerFast,
-    GPT2TokenizerFast,
     PreTrainedTokenizer,
     PreTrainedTokenizerBase,
     PreTrainedTokenizerFast,
@@ -51,24 +43,20 @@ from transformers import (
     TrainingArguments,
     is_flax_available,
     is_tf_available,
-    is_tokenizers_available,
     is_torch_available,
     logging,
 )
 from transformers.testing_utils import (
-    TOKEN,
-    USER,
     check_json_file_has_correct_format,
     get_tests_dir,
     is_pt_tf_cross_test,
-    is_staging_test,
     require_tf,
     require_tokenizers,
     require_torch,
     run_test_in_subprocess,
     slow,
 )
-from transformers.tokenization_utils import AddedToken, Trie
+from transformers.tokenization_utils import AddedToken
 
 
 if is_torch_available():
@@ -79,15 +67,6 @@ if TYPE_CHECKING:
     from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel
 
 
-sys.path.append(str(Path(__file__).parent.parent / "utils"))
-
-from test_module.custom_tokenization import CustomTokenizer  # noqa E402
-
-
-if is_tokenizers_available():
-    from test_module.custom_tokenization_fast import CustomTokenizerFast
-
-
 logger = logging.get_logger(__name__)
 
 NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"]
@@ -3974,238 +3953,3 @@ class TokenizerTesterMixin:
         tokenizer.clean_up_tokenization_spaces = True
         decoded = tokenizer.decode(tokens)
         assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
-
-
-class TokenizerUtilTester(unittest.TestCase):
-    def test_cached_files_are_used_when_internet_is_down(self):
-        # A mock response for an HTTP head request to emulate server down
-        response_mock = mock.Mock()
-        response_mock.status_code = 500
-        response_mock.headers = {}
-        response_mock.raise_for_status.side_effect = HTTPError
-        response_mock.json.return_value = {}
-
-        # Download this model to make sure it's in the cache.
-        _ = BertTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
-
-        # Under the mock environment we get a 500 error when trying to reach the tokenizer.
-        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
-            _ = BertTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
-            # This check we did call the fake head request
-            mock_head.assert_called()
-
-    @require_tokenizers
-    def test_cached_files_are_used_when_internet_is_down_missing_files(self):
-        # A mock response for an HTTP head request to emulate server down
-        response_mock = mock.Mock()
-        response_mock.status_code = 500
-        response_mock.headers = {}
-        response_mock.raise_for_status.side_effect = HTTPError
-        response_mock.json.return_value = {}
-
-        # Download this model to make sure it's in the cache.
-        _ = GPT2TokenizerFast.from_pretrained("gpt2")
-
-        # Under the mock environment we get a 500 error when trying to reach the tokenizer.
-        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
-            _ = GPT2TokenizerFast.from_pretrained("gpt2")
-            # This check we did call the fake head request
-            mock_head.assert_called()
-
-    def test_legacy_load_from_one_file(self):
-        # This test is for deprecated behavior and can be removed in v5
-        try:
-            tmp_file = tempfile.mktemp()
-            with open(tmp_file, "wb") as f:
-                http_get("https://huggingface.co/albert-base-v1/resolve/main/spiece.model", f)
-
-            _ = AlbertTokenizer.from_pretrained(tmp_file)
-        finally:
-            os.remove(tmp_file)
-
-        # Supporting this legacy load introduced a weird bug where the tokenizer would load local files if they are in
-        # the current folder and have the right name.
-        if os.path.isfile("tokenizer.json"):
-            # We skip the test if the user has a `tokenizer.json` in this folder to avoid deleting it.
-            return
-        try:
-            with open("tokenizer.json", "wb") as f:
-                http_get("https://huggingface.co/hf-internal-testing/tiny-random-bert/blob/main/tokenizer.json", f)
-            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
-            # The tiny random BERT has a vocab size of 1024, tiny gpt2 as a vocab size of 1000
-            self.assertEqual(tokenizer.vocab_size, 1000)
-            # Tokenizer should depend on the remote checkpoint, not the local tokenizer.json file.
-
-        finally:
-            os.remove("tokenizer.json")
-
-    def test_legacy_load_from_url(self):
-        # This test is for deprecated behavior and can be removed in v5
-        _ = AlbertTokenizer.from_pretrained("https://huggingface.co/albert-base-v1/resolve/main/spiece.model")
-
-
-@is_staging_test
-class TokenizerPushToHubTester(unittest.TestCase):
-    vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]
-
-    @classmethod
-    def setUpClass(cls):
-        cls._token = TOKEN
-        HfFolder.save_token(TOKEN)
-
-    @classmethod
-    def tearDownClass(cls):
-        try:
-            delete_repo(token=cls._token, repo_id="test-tokenizer")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, repo_id="valid_org/test-tokenizer-org")
-        except HTTPError:
-            pass
-
-        try:
-            delete_repo(token=cls._token, repo_id="test-dynamic-tokenizer")
-        except HTTPError:
-            pass
-
-    def test_push_to_hub(self):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            vocab_file = os.path.join(tmp_dir, "vocab.txt")
-            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
-                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
-            tokenizer = BertTokenizer(vocab_file)
-
-        tokenizer.push_to_hub("test-tokenizer", use_auth_token=self._token)
-        new_tokenizer = BertTokenizer.from_pretrained(f"{USER}/test-tokenizer")
-        self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)
-
-        # Reset repo
-        delete_repo(token=self._token, repo_id="test-tokenizer")
-
-        # Push to hub via save_pretrained
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            tokenizer.save_pretrained(tmp_dir, repo_id="test-tokenizer", push_to_hub=True, use_auth_token=self._token)
-
-        new_tokenizer = BertTokenizer.from_pretrained(f"{USER}/test-tokenizer")
-        self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)
-
-    def test_push_to_hub_in_organization(self):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            vocab_file = os.path.join(tmp_dir, "vocab.txt")
-            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
-                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
-            tokenizer = BertTokenizer(vocab_file)
-
-        tokenizer.push_to_hub("valid_org/test-tokenizer-org", use_auth_token=self._token)
-        new_tokenizer = BertTokenizer.from_pretrained("valid_org/test-tokenizer-org")
-        self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)
-
-        # Reset repo
-        delete_repo(token=self._token, repo_id="valid_org/test-tokenizer-org")
-
-        # Push to hub via save_pretrained
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            tokenizer.save_pretrained(
-                tmp_dir, repo_id="valid_org/test-tokenizer-org", push_to_hub=True, use_auth_token=self._token
-            )
-
-        new_tokenizer = BertTokenizer.from_pretrained("valid_org/test-tokenizer-org")
-        self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)
-
-    @require_tokenizers
-    def test_push_to_hub_dynamic_tokenizer(self):
-        CustomTokenizer.register_for_auto_class()
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            vocab_file = os.path.join(tmp_dir, "vocab.txt")
-            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
-                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
-            tokenizer = CustomTokenizer(vocab_file)
-
-        # No fast custom tokenizer
-        tokenizer.push_to_hub("test-dynamic-tokenizer", use_auth_token=self._token)
-
-        tokenizer = AutoTokenizer.from_pretrained(f"{USER}/test-dynamic-tokenizer", trust_remote_code=True)
-        # Can't make an isinstance check because the new_model.config is from the CustomTokenizer class of a dynamic module
-        self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizer")
-
-        # Fast and slow custom tokenizer
-        CustomTokenizerFast.register_for_auto_class()
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            vocab_file = os.path.join(tmp_dir, "vocab.txt")
-            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
-                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
-
-            bert_tokenizer = BertTokenizerFast.from_pretrained(tmp_dir)
-            bert_tokenizer.save_pretrained(tmp_dir)
-            tokenizer = CustomTokenizerFast.from_pretrained(tmp_dir)
-
-        tokenizer.push_to_hub("test-dynamic-tokenizer", use_auth_token=self._token)
-
-        tokenizer = AutoTokenizer.from_pretrained(f"{USER}/test-dynamic-tokenizer", trust_remote_code=True)
-        # Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module
-        self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizerFast")
-        tokenizer = AutoTokenizer.from_pretrained(
-            f"{USER}/test-dynamic-tokenizer", use_fast=False, trust_remote_code=True
-        )
-        # Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module
-        self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizer")
-
-
-class TrieTest(unittest.TestCase):
-    def test_trie(self):
-        trie = Trie()
-        trie.add("Hello 友達")
-        self.assertEqual(trie.data, {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}})
-        trie.add("Hello")
-        trie.data
-        self.assertEqual(trie.data, {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}})
-
-    def test_trie_split(self):
-        trie = Trie()
-        self.assertEqual(trie.split("[CLS] This is a extra_id_100"), ["[CLS] This is a extra_id_100"])
-        trie.add("[CLS]")
-        trie.add("extra_id_1")
-        trie.add("extra_id_100")
-        self.assertEqual(trie.split("[CLS] This is a extra_id_100"), ["[CLS]", " This is a ", "extra_id_100"])
-
-    def test_trie_single(self):
-        trie = Trie()
-        trie.add("A")
-        self.assertEqual(trie.split("ABC"), ["A", "BC"])
-        self.assertEqual(trie.split("BCA"), ["BC", "A"])
-
-    def test_trie_final(self):
-        trie = Trie()
-        trie.add("TOKEN]")
-        trie.add("[SPECIAL_TOKEN]")
-        self.assertEqual(trie.split("This is something [SPECIAL_TOKEN]"), ["This is something ", "[SPECIAL_TOKEN]"])
-
-    def test_trie_subtokens(self):
-        trie = Trie()
-        trie.add("A")
-        trie.add("P")
-        trie.add("[SPECIAL_TOKEN]")
-        self.assertEqual(trie.split("This is something [SPECIAL_TOKEN]"), ["This is something ", "[SPECIAL_TOKEN]"])
-
-    def test_trie_suffix_tokens(self):
-        trie = Trie()
-        trie.add("AB")
-        trie.add("B")
-        trie.add("C")
-        self.assertEqual(trie.split("ABC"), ["AB", "C"])
-
-    def test_trie_skip(self):
-        trie = Trie()
-        trie.add("ABC")
-        trie.add("B")
-        trie.add("CD")
-        self.assertEqual(trie.split("ABCD"), ["ABC", "D"])
-
-    def test_cut_text_hardening(self):
-        # Even if the offsets are wrong, we necessarily output correct string
-        # parts.
-        trie = Trie()
-        parts = trie.cut_text("ABC", [0, 0, 2, 1, 2, 3])
-        self.assertEqual(parts, ["AB", "C"])
diff --git a/tests/test_tokenization_utils.py b/tests/test_tokenization_utils.py
new file mode 100644
index 0000000000..2984de97fd
--- /dev/null
+++ b/tests/test_tokenization_utils.py
@@ -0,0 +1,280 @@
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import tempfile
+import unittest
+import unittest.mock as mock
+from pathlib import Path
+
+from huggingface_hub import HfFolder, delete_repo
+from huggingface_hub.file_download import http_get
+from requests.exceptions import HTTPError
+
+from transformers import (
+    AlbertTokenizer,
+    AutoTokenizer,
+    BertTokenizer,
+    BertTokenizerFast,
+    GPT2TokenizerFast,
+    is_tokenizers_available,
+)
+from transformers.testing_utils import TOKEN, USER, is_staging_test, require_tokenizers
+from transformers.tokenization_utils import Trie
+
+
+sys.path.append(str(Path(__file__).parent.parent / "utils"))
+
+from test_module.custom_tokenization import CustomTokenizer  # noqa E402
+
+
+if is_tokenizers_available():
+    from test_module.custom_tokenization_fast import CustomTokenizerFast
+
+
+class TokenizerUtilTester(unittest.TestCase):
+    def test_cached_files_are_used_when_internet_is_down(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = {}
+        response_mock.raise_for_status.side_effect = HTTPError
+        response_mock.json.return_value = {}
+
+        # Download this model to make sure it's in the cache.
+        _ = BertTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        # Under the mock environment we get a 500 error when trying to reach the tokenizer.
+        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
+            _ = BertTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
+            # This check we did call the fake head request
+            mock_head.assert_called()
+
+    @require_tokenizers
+    def test_cached_files_are_used_when_internet_is_down_missing_files(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = {}
+        response_mock.raise_for_status.side_effect = HTTPError
+        response_mock.json.return_value = {}
+
+        # Download this model to make sure it's in the cache.
+        _ = GPT2TokenizerFast.from_pretrained("gpt2")
+
+        # Under the mock environment we get a 500 error when trying to reach the tokenizer.
+        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
+            _ = GPT2TokenizerFast.from_pretrained("gpt2")
+            # This check we did call the fake head request
+            mock_head.assert_called()
+
+    def test_legacy_load_from_one_file(self):
+        # This test is for deprecated behavior and can be removed in v5
+        try:
+            tmp_file = tempfile.mktemp()
+            with open(tmp_file, "wb") as f:
+                http_get("https://huggingface.co/albert-base-v1/resolve/main/spiece.model", f)
+
+            _ = AlbertTokenizer.from_pretrained(tmp_file)
+        finally:
+            os.remove(tmp_file)
+
+        # Supporting this legacy load introduced a weird bug where the tokenizer would load local files if they are in
+        # the current folder and have the right name.
+        if os.path.isfile("tokenizer.json"):
+            # We skip the test if the user has a `tokenizer.json` in this folder to avoid deleting it.
+            return
+        try:
+            with open("tokenizer.json", "wb") as f:
+                http_get("https://huggingface.co/hf-internal-testing/tiny-random-bert/blob/main/tokenizer.json", f)
+            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+            # The tiny random BERT has a vocab size of 1024, tiny gpt2 as a vocab size of 1000
+            self.assertEqual(tokenizer.vocab_size, 1000)
+            # Tokenizer should depend on the remote checkpoint, not the local tokenizer.json file.
+
+        finally:
+            os.remove("tokenizer.json")
+
+    def test_legacy_load_from_url(self):
+        # This test is for deprecated behavior and can be removed in v5
+        _ = AlbertTokenizer.from_pretrained("https://huggingface.co/albert-base-v1/resolve/main/spiece.model")
+
+
+@is_staging_test
+class TokenizerPushToHubTester(unittest.TestCase):
+    vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]
+
+    @classmethod
+    def setUpClass(cls):
+        cls._token = TOKEN
+        HfFolder.save_token(TOKEN)
+
+    @classmethod
+    def tearDownClass(cls):
+        try:
+            delete_repo(token=cls._token, repo_id="test-tokenizer")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(token=cls._token, repo_id="valid_org/test-tokenizer-org")
+        except HTTPError:
+            pass
+
+        try:
+            delete_repo(token=cls._token, repo_id="test-dynamic-tokenizer")
+        except HTTPError:
+            pass
+
+    def test_push_to_hub(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            vocab_file = os.path.join(tmp_dir, "vocab.txt")
+            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
+                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
+            tokenizer = BertTokenizer(vocab_file)
+
+        tokenizer.push_to_hub("test-tokenizer", use_auth_token=self._token)
+        new_tokenizer = BertTokenizer.from_pretrained(f"{USER}/test-tokenizer")
+        self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)
+
+        # Reset repo
+        delete_repo(token=self._token, repo_id="test-tokenizer")
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tokenizer.save_pretrained(tmp_dir, repo_id="test-tokenizer", push_to_hub=True, use_auth_token=self._token)
+
+        new_tokenizer = BertTokenizer.from_pretrained(f"{USER}/test-tokenizer")
+        self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)
+
+    def test_push_to_hub_in_organization(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            vocab_file = os.path.join(tmp_dir, "vocab.txt")
+            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
+                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
+            tokenizer = BertTokenizer(vocab_file)
+
+        tokenizer.push_to_hub("valid_org/test-tokenizer-org", use_auth_token=self._token)
+        new_tokenizer = BertTokenizer.from_pretrained("valid_org/test-tokenizer-org")
+        self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)
+
+        # Reset repo
+        delete_repo(token=self._token, repo_id="valid_org/test-tokenizer-org")
+
+        # Push to hub via save_pretrained
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tokenizer.save_pretrained(
+                tmp_dir, repo_id="valid_org/test-tokenizer-org", push_to_hub=True, use_auth_token=self._token
+            )
+
+        new_tokenizer = BertTokenizer.from_pretrained("valid_org/test-tokenizer-org")
+        self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)
+
+    @require_tokenizers
+    def test_push_to_hub_dynamic_tokenizer(self):
+        CustomTokenizer.register_for_auto_class()
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            vocab_file = os.path.join(tmp_dir, "vocab.txt")
+            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
+                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
+            tokenizer = CustomTokenizer(vocab_file)
+
+        # No fast custom tokenizer
+        tokenizer.push_to_hub("test-dynamic-tokenizer", use_auth_token=self._token)
+
+        tokenizer = AutoTokenizer.from_pretrained(f"{USER}/test-dynamic-tokenizer", trust_remote_code=True)
+        # Can't make an isinstance check because the new_model.config is from the CustomTokenizer class of a dynamic module
+        self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizer")
+
+        # Fast and slow custom tokenizer
+        CustomTokenizerFast.register_for_auto_class()
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            vocab_file = os.path.join(tmp_dir, "vocab.txt")
+            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
+                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
+
+            bert_tokenizer = BertTokenizerFast.from_pretrained(tmp_dir)
+            bert_tokenizer.save_pretrained(tmp_dir)
+            tokenizer = CustomTokenizerFast.from_pretrained(tmp_dir)
+
+        tokenizer.push_to_hub("test-dynamic-tokenizer", use_auth_token=self._token)
+
+        tokenizer = AutoTokenizer.from_pretrained(f"{USER}/test-dynamic-tokenizer", trust_remote_code=True)
+        # Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module
+        self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizerFast")
+        tokenizer = AutoTokenizer.from_pretrained(
+            f"{USER}/test-dynamic-tokenizer", use_fast=False, trust_remote_code=True
+        )
+        # Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module
+        self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizer")
+
+
+class TrieTest(unittest.TestCase):
+    def test_trie(self):
+        trie = Trie()
+        trie.add("Hello 友達")
+        self.assertEqual(trie.data, {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}})
+        trie.add("Hello")
+        trie.data
+        self.assertEqual(trie.data, {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}})
+
+    def test_trie_split(self):
+        trie = Trie()
+        self.assertEqual(trie.split("[CLS] This is a extra_id_100"), ["[CLS] This is a extra_id_100"])
+        trie.add("[CLS]")
+        trie.add("extra_id_1")
+        trie.add("extra_id_100")
+        self.assertEqual(trie.split("[CLS] This is a extra_id_100"), ["[CLS]", " This is a ", "extra_id_100"])
+
+    def test_trie_single(self):
+        trie = Trie()
+        trie.add("A")
+        self.assertEqual(trie.split("ABC"), ["A", "BC"])
+        self.assertEqual(trie.split("BCA"), ["BC", "A"])
+
+    def test_trie_final(self):
+        trie = Trie()
+        trie.add("TOKEN]")
+        trie.add("[SPECIAL_TOKEN]")
+        self.assertEqual(trie.split("This is something [SPECIAL_TOKEN]"), ["This is something ", "[SPECIAL_TOKEN]"])
+
+    def test_trie_subtokens(self):
+        trie = Trie()
+        trie.add("A")
+        trie.add("P")
+        trie.add("[SPECIAL_TOKEN]")
+        self.assertEqual(trie.split("This is something [SPECIAL_TOKEN]"), ["This is something ", "[SPECIAL_TOKEN]"])
+
+    def test_trie_suffix_tokens(self):
+        trie = Trie()
+        trie.add("AB")
+        trie.add("B")
+        trie.add("C")
+        self.assertEqual(trie.split("ABC"), ["AB", "C"])
+
+    def test_trie_skip(self):
+        trie = Trie()
+        trie.add("ABC")
+        trie.add("B")
+        trie.add("CD")
+        self.assertEqual(trie.split("ABCD"), ["ABC", "D"])
+
+    def test_cut_text_hardening(self):
+        # Even if the offsets are wrong, we necessarily output correct string
+        # parts.
+        trie = Trie()
+        parts = trie.cut_text("ABC", [0, 0, 2, 1, 2, 3])
+        self.assertEqual(parts, ["AB", "C"])