Remove all traces of low_cpu_mem_usage (#38792)

* remove it from all py files
* remove it from the doc
* remove it from examples
* style
* remove traces of _fast_init
* Update test_peft_integration.py
* CIs
2025-06-12 16:39:33 +02:00
parent 3542e0b844
commit 4b8ec667e9
76 changed files with 100 additions and 598 deletions
--- a/tests/models/bamba/test_modeling_bamba.py
+++ b/tests/models/bamba/test_modeling_bamba.py
@@ -556,7 +556,6 @@ class BambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
                        tmpdirname,
                        torch_dtype=torch.float16,
                        attn_implementation="flash_attention_2",
-                        low_cpu_mem_usage=True,
                    )
                    .to(torch_device)
                    .eval()
@@ -600,7 +599,7 @@ class BambaModelIntegrationTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        model_id = "ibm-fms/Bamba-9B"
-        cls.model = BambaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
+        cls.model = BambaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
        cls.tokenizer = AutoTokenizer.from_pretrained(model_id)

        # feels a bit forced to have to do this for the generation test
--- a/tests/models/cohere/test_modeling_cohere.py
+++ b/tests/models/cohere/test_modeling_cohere.py
@@ -238,9 +238,7 @@ class CohereIntegrationTest(unittest.TestCase):
        ).to(device=torch_device, dtype=torch.float16)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        model = CohereForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to(
-            torch_device
-        )
+        model = CohereForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)

        tokenizer.pad_token = tokenizer.eos_token

--- a/tests/models/cohere2/test_modeling_cohere2.py
+++ b/tests/models/cohere2/test_modeling_cohere2.py
@@ -144,7 +144,7 @@ class Cohere2IntegrationTest(unittest.TestCase):
        ]

        model = AutoModelForCausalLM.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager"
+            model_id, torch_dtype=torch.bfloat16, attn_implementation="eager"
        ).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -168,7 +168,7 @@ class Cohere2IntegrationTest(unittest.TestCase):
        # fmt: on

        model = AutoModelForCausalLM.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager"
+            model_id, torch_dtype=torch.float16, attn_implementation="eager"
        ).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -189,7 +189,7 @@ class Cohere2IntegrationTest(unittest.TestCase):
        ]

        model = AutoModelForCausalLM.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
+            model_id, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
        ).to(torch_device)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
--- a/tests/models/dac/test_modeling_dac.py
+++ b/tests/models/dac/test_modeling_dac.py
@@ -280,18 +280,6 @@ class DacModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    def test_hidden_states_output(self):
        pass

-    @unittest.skip("No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
-    @unittest.skip("No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_checkpoints(self):
-        pass
-
-    @unittest.skip("No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_no_safetensors(self):
-        pass
-
    def test_determinism(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

--- a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py
+++ b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py
@@ -459,7 +459,6 @@ class DeepseekV3ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
        model_sdpa = DeepseekV3ForCausalLM.from_pretrained(
            "bzantium/tiny-deepseek-v3",
            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True,
        ).to(torch_device)

        self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
@@ -467,7 +466,6 @@ class DeepseekV3ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
        model_eager = DeepseekV3ForCausalLM.from_pretrained(
            "bzantium/tiny-deepseek-v3",
            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True,
            attn_implementation="eager",
        ).to(torch_device)

--- a/tests/models/deformable_detr/test_modeling_deformable_detr.py
+++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py
@@ -605,18 +605,6 @@ class DeformableDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te
                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                    )

-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_checkpoints(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_no_safetensors(self):
-        pass
-
    def test_two_stage_training(self):
        model_class = DeformableDetrForObjectDetection
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
--- a/tests/models/diffllama/test_modeling_diffllama.py
+++ b/tests/models/diffllama/test_modeling_diffllama.py
@@ -514,7 +514,6 @@ class DiffLlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
        model_sdpa = DiffLlamaForCausalLM.from_pretrained(
            "kajuma/DiffLlama-0.3B-handcut",
            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True,
        ).to(torch_device)

        self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
@@ -522,7 +521,6 @@ class DiffLlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
        model_eager = DiffLlamaForCausalLM.from_pretrained(
            "kajuma/DiffLlama-0.3B-handcut",
            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True,
            attn_implementation="eager",
        ).to(torch_device)

--- a/tests/models/encodec/test_modeling_encodec.py
+++ b/tests/models/encodec/test_modeling_encodec.py
@@ -343,18 +343,6 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
    def test_hidden_states_output(self):
        pass

-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_checkpoints(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_no_safetensors(self):
-        pass
-
    def test_determinism(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

--- a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py
+++ b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py
@@ -381,9 +381,7 @@ class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest
    @slow
    # Ignore copy
    def test_model_from_pretrained(self):
-        model = FalconMambaModel.from_pretrained(
-            "tiiuae/falcon-mamba-7b", torch_dtype=torch.float16, low_cpu_mem_usage=True
-        )
+        model = FalconMambaModel.from_pretrained("tiiuae/falcon-mamba-7b", torch_dtype=torch.float16)
        self.assertIsNotNone(model)

    def test_model_outputs_equivalence(self):
--- a/tests/models/gemma/test_modeling_gemma.py
+++ b/tests/models/gemma/test_modeling_gemma.py
@@ -126,9 +126,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to(
-            torch_device
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)

        model.generation_config.cache_implementation = "static"

@@ -149,9 +147,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
-            torch_device
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -171,9 +167,7 @@ class GemmaIntegrationTest(unittest.TestCase):
        ]

        # bfloat16 gives strange values, likely due to it has lower precision + very short prompts
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager"
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, attn_implementation="eager")
        model.to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -195,7 +189,7 @@ class GemmaIntegrationTest(unittest.TestCase):
        ]

        model = AutoModelForCausalLM.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
+            model_id, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
        )
        model.to(torch_device)

@@ -216,7 +210,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi today I'd like to share with you my experience with the new wattpad wattpad wattpad wattpad wattpad wattpad wattpad",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True)
+        model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -235,7 +229,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi,\n\nI have a problem with my 2005 1.6 16",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(torch_device)
+        model = AutoModelForCausalLM.from_pretrained(model_id).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -256,9 +250,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi today I am going to show you how to make a simple and easy to make a DIY 3D",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to(
-            torch_device
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -290,9 +282,7 @@ class GemmaIntegrationTest(unittest.TestCase):
        # fmt: on
        expected_text = EXPECTED_TEXTS.get_expectation()

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
-            torch_device
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -312,9 +302,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi today I am going to show you how to make a simple and easy to make a DIY 3D",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to(
-            torch_device
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)

        model.generation_config.cache_implementation = "static"

@@ -333,7 +321,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True)
+        model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -451,9 +439,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi today we have the review for a <strong>2016/2017</strong> season of",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
-            torch_device
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
--- a/tests/models/gemma2/test_modeling_gemma2.py
+++ b/tests/models/gemma2/test_modeling_gemma2.py
@@ -197,7 +197,7 @@ class Gemma2IntegrationTest(unittest.TestCase):
        ]

        model = AutoModelForCausalLM.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager"
+            model_id, torch_dtype=torch.bfloat16, attn_implementation="eager"
        ).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -218,7 +218,7 @@ class Gemma2IntegrationTest(unittest.TestCase):
        ]

        model = AutoModelForCausalLM.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager"
+            model_id, torch_dtype=torch.float16, attn_implementation="eager"
        ).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -241,7 +241,7 @@ class Gemma2IntegrationTest(unittest.TestCase):
        ]

        model = AutoModelForCausalLM.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
+            model_id, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
        ).to(torch_device)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
@@ -271,7 +271,7 @@ class Gemma2IntegrationTest(unittest.TestCase):
        EXPECTED_BATCH_TEXT = EXPECTED_BATCH_TEXTS.get_expectation()

        model = AutoModelForCausalLM.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
+            model_id, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
        ).to(torch_device)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
@@ -419,7 +419,7 @@ class Gemma2IntegrationTest(unittest.TestCase):
        ]

        model = AutoModelForCausalLM.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
+            model_id, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
        ).to(torch_device)
        assert model.config._attn_implementation == "flex_attention"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
--- a/tests/models/gemma3/test_modeling_gemma3.py
+++ b/tests/models/gemma3/test_modeling_gemma3.py
@@ -391,9 +391,7 @@ class Gemma3IntegrationTest(unittest.TestCase):
    def test_model_4b_bf16(self):
        model_id = "google/gemma-3-4b-it"

-        model = Gemma3ForConditionalGeneration.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
-        ).to(torch_device)
+        model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)

        inputs = self.processor.apply_chat_template(
            self.messages,
@@ -421,9 +419,7 @@ class Gemma3IntegrationTest(unittest.TestCase):
    def test_model_4b_batch(self):
        model_id = "google/gemma-3-4b-it"

-        model = Gemma3ForConditionalGeneration.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
-        ).to(torch_device)
+        model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)

        messages_2 = [
            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
@@ -474,9 +470,7 @@ class Gemma3IntegrationTest(unittest.TestCase):
    def test_model_4b_crops(self):
        model_id = "google/gemma-3-4b-it"

-        model = Gemma3ForConditionalGeneration.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
-        ).to(torch_device)
+        model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)

        crop_config = {
            "images_kwargs": {
@@ -516,9 +510,7 @@ class Gemma3IntegrationTest(unittest.TestCase):
    def test_model_4b_batch_crops(self):
        model_id = "google/gemma-3-4b-it"

-        model = Gemma3ForConditionalGeneration.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
-        ).to(torch_device)
+        model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
        crop_config = {
            "images_kwargs": {
                "do_pan_and_scan": True,
@@ -576,9 +568,7 @@ class Gemma3IntegrationTest(unittest.TestCase):
    def test_model_4b_multiimage(self):
        model_id = "google/gemma-3-4b-it"

-        model = Gemma3ForConditionalGeneration.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
-        ).to(torch_device)
+        model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)

        messages = [
            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
@@ -616,9 +606,7 @@ class Gemma3IntegrationTest(unittest.TestCase):
    def test_model_1b_text_only(self):
        model_id = "google/gemma-3-1b-it"

-        model = Gemma3ForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
-            torch_device
-        )
+        model = Gemma3ForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
        tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
        inputs = tokenizer("Write a poem about Machine Learning.", return_tensors="pt").to(torch_device)

--- a/tests/models/glm/test_modeling_glm.py
+++ b/tests/models/glm/test_modeling_glm.py
@@ -88,7 +88,7 @@ class GlmIntegrationTest(unittest.TestCase):
        ]

        model = AutoModelForCausalLM.from_pretrained(
-            self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, revision=self.revision
+            self.model_id, torch_dtype=torch.float16, revision=self.revision
        ).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision)
@@ -106,7 +106,7 @@ class GlmIntegrationTest(unittest.TestCase):
        ]

        model = AutoModelForCausalLM.from_pretrained(
-            self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, revision=self.revision
+            self.model_id, torch_dtype=torch.bfloat16, revision=self.revision
        ).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision)
@@ -125,7 +125,6 @@ class GlmIntegrationTest(unittest.TestCase):

        model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
-            low_cpu_mem_usage=True,
            torch_dtype=torch.bfloat16,
            attn_implementation="eager",
            revision=self.revision,
@@ -149,7 +148,6 @@ class GlmIntegrationTest(unittest.TestCase):

        model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
-            low_cpu_mem_usage=True,
            torch_dtype=torch.bfloat16,
            attn_implementation="sdpa",
            revision=self.revision,
@@ -174,7 +172,6 @@ class GlmIntegrationTest(unittest.TestCase):

        model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
-            low_cpu_mem_usage=True,
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
            revision=self.revision,
--- a/tests/models/glm4/test_modeling_glm4.py
+++ b/tests/models/glm4/test_modeling_glm4.py
@@ -104,9 +104,7 @@ class Glm4IntegrationTest(unittest.TestCase):
        )
        EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()

-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16
-        ).to(torch_device)
+        model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -132,9 +130,7 @@ class Glm4IntegrationTest(unittest.TestCase):
        )
        EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()

-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
-        ).to(torch_device)
+        model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.bfloat16).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -162,7 +158,6 @@ class Glm4IntegrationTest(unittest.TestCase):

        model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
-            low_cpu_mem_usage=True,
            torch_dtype=torch.bfloat16,
            attn_implementation="eager",
        )
@@ -195,7 +190,6 @@ class Glm4IntegrationTest(unittest.TestCase):

        model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
-            low_cpu_mem_usage=True,
            torch_dtype=torch.bfloat16,
            attn_implementation="sdpa",
        )
@@ -226,7 +220,6 @@ class Glm4IntegrationTest(unittest.TestCase):

        model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
-            low_cpu_mem_usage=True,
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
        )
--- a/tests/models/helium/test_modeling_helium.py
+++ b/tests/models/helium/test_modeling_helium.py
@@ -87,9 +87,9 @@ class HeliumIntegrationTest(unittest.TestCase):
            "Hello, today is a great day to start a new project. I have been working on a new project for a while now and I have"
        ]

-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, revision="refs/pr/1"
-        ).to(torch_device)
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, revision="refs/pr/1").to(
+            torch_device
+        )
        tokenizer = AutoTokenizer.from_pretrained(model_id, revision="refs/pr/1")
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)

--- a/tests/models/instructblip/test_modeling_instructblip.py
+++ b/tests/models/instructblip/test_modeling_instructblip.py
@@ -727,7 +727,7 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
    def test_inference_vicuna_7b(self):
        processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
        model = InstructBlipForConditionalGeneration.from_pretrained(
-            "Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True
+            "Salesforce/instructblip-vicuna-7b", load_in_8bit=True
        )

        url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
@@ -752,7 +752,6 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
        model = InstructBlipForConditionalGeneration.from_pretrained(
            "Salesforce/instructblip-flan-t5-xl",
            torch_dtype=torch.bfloat16,
-            low_cpu_mem_usage=True,
        ).to(torch_device)

        url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
@@ -789,7 +788,6 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
        model = InstructBlipForConditionalGeneration.from_pretrained(
            "Salesforce/instructblip-flan-t5-xl",
            torch_dtype=torch.bfloat16,
-            low_cpu_mem_usage=True,
        ).to(torch_device)
        processor.image_processor.size = {"height": 500, "width": 500}

@@ -810,7 +808,6 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
        model = InstructBlipForConditionalGeneration.from_pretrained(
            "Salesforce/instructblip-flan-t5-xl",
            torch_dtype=torch.bfloat16,
-            low_cpu_mem_usage=True,
        ).to(torch_device)

        image = prepare_img()
--- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py
+++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py
@@ -744,7 +744,8 @@ class InstructBlipVideoModelIntegrationTest(unittest.TestCase):
    def test_inference_vicuna_7b(self):
        processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
        model = InstructBlipVideoForConditionalGeneration.from_pretrained(
-            "Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True
+            "Salesforce/instructblip-vicuna-7b",
+            load_in_8bit=True,
        )

        clip = prepare_video()
@@ -762,7 +763,8 @@ class InstructBlipVideoModelIntegrationTest(unittest.TestCase):
    def test_expansion_in_processing(self):
        processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
        model = InstructBlipVideoForConditionalGeneration.from_pretrained(
-            "Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True
+            "Salesforce/instructblip-vicuna-7b",
+            load_in_8bit=True,
        )

        clip = prepare_video()
--- a/tests/models/jamba/test_modeling_jamba.py
+++ b/tests/models/jamba/test_modeling_jamba.py
@@ -527,7 +527,6 @@ class JambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
                    tmpdirname,
                    torch_dtype=torch.float16,
                    attn_implementation="flash_attention_2",
-                    low_cpu_mem_usage=True,
                    load_in_4bit=True,
                )

@@ -563,7 +562,10 @@ class JambaModelIntegrationTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        model_id = "ai21labs/Jamba-tiny-dev"
-        cls.model = JambaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
+        cls.model = JambaForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch.bfloat16,
+        )
        cls.tokenizer = AutoTokenizer.from_pretrained(model_id)
        cls.device_properties = get_device_properties()

--- a/tests/models/lxmert/test_modeling_lxmert.py
+++ b/tests/models/lxmert/test_modeling_lxmert.py
@@ -765,18 +765,6 @@ class LxmertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):

        return tf_inputs_dict

-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_checkpoints(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_no_safetensors(self):
-        pass
-
    @unittest.skip(
        reason="This architecture has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245"
    )
--- a/tests/models/marian/test_modeling_marian.py
+++ b/tests/models/marian/test_modeling_marian.py
@@ -351,18 +351,6 @@ class MarianModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_checkpoints(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_no_safetensors(self):
-        pass
-

 def assert_tensors_close(a, b, atol=1e-12, prefix=""):
    """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error."""
--- a/tests/models/minimax/test_modeling_minimax.py
+++ b/tests/models/minimax/test_modeling_minimax.py
@@ -246,9 +246,10 @@ class MiniMaxIntegrationTest(unittest.TestCase):
        model_id = "hf-internal-testing/MiniMax-tiny"
        dummy_input = torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device)

-        model = MiniMaxForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(
-            torch_device
-        )
+        model = MiniMaxForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch.bfloat16,
+        ).to(torch_device)
        expected_slice = torch.tensor(
            [[1.0312, -0.5156, -0.3262], [-0.1152, 0.4336, 0.2412], [1.2188, -0.5898, -0.0381]]
        ).to(torch_device)
@@ -265,9 +266,10 @@ class MiniMaxIntegrationTest(unittest.TestCase):
        model_id = "hf-internal-testing/MiniMax-tiny"
        dummy_input = torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device)

-        model = MiniMaxForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(
-            torch_device
-        )
+        model = MiniMaxForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch.bfloat16,
+        ).to(torch_device)
        expected_slice = (
            torch.tensor([[0, 1, 0, 933, 307, 3102, 2457, 1208], [0, 1, 0, 933, 307, 3102, 2457, 1208]])
            .to(torch.int64)
--- a/tests/models/mixtral/test_modeling_mixtral.py
+++ b/tests/models/mixtral/test_modeling_mixtral.py
@@ -156,9 +156,10 @@ class MixtralIntegrationTest(unittest.TestCase):
        model_id = "hf-internal-testing/Mixtral-tiny"
        dummy_input = torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device)

-        model = MixtralForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(
-            torch_device
-        )
+        model = MixtralForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch.bfloat16,
+        ).to(torch_device)
        # TODO: might need to tweak it in case the logits do not match on our daily runners
        # these logits have been obtained with the original megablocks implementation.
        # ("cuda", 8) for A100/A10, and ("cuda", 7) for T4
@@ -189,9 +190,10 @@ class MixtralIntegrationTest(unittest.TestCase):
        dummy_input = torch.LongTensor([[0, 0, 0, 0, 0, 0, 1, 2, 3], [1, 1, 2, 3, 4, 5, 6, 7, 8]]).to(torch_device)
        attention_mask = dummy_input.ne(0).to(torch.long)

-        model = MixtralForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(
-            torch_device
-        )
+        model = MixtralForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch.bfloat16,
+        ).to(torch_device)

        # TODO: might need to tweak it in case the logits do not match on our daily runners
        #
--- a/tests/models/moshi/test_modeling_moshi.py
+++ b/tests/models/moshi/test_modeling_moshi.py
@@ -722,7 +722,6 @@ class MoshiTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
                model_sdpa = model_class.from_pretrained(
                    tmpdirname,
                    torch_dtype=torch.float16,
-                    low_cpu_mem_usage=True,
                ).to(torch_device)

                self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
@@ -730,7 +729,6 @@ class MoshiTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
                model_eager = model_class.from_pretrained(
                    tmpdirname,
                    torch_dtype=torch.float16,
-                    low_cpu_mem_usage=True,
                    attn_implementation="eager",
                ).to(torch_device)

--- a/tests/models/musicgen/test_modeling_musicgen.py
+++ b/tests/models/musicgen/test_modeling_musicgen.py
@@ -788,18 +788,6 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
    def test_tied_weights_keys(self):
        pass

-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_checkpoints(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_no_safetensors(self):
-        pass
-
    # override since changing `output_hidden_states` / `output_attentions` from the top-level model config won't work
    def test_retain_grad_hidden_states_attentions(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
--- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
+++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
@@ -789,18 +789,6 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
    def test_tied_weights_keys(self):
        pass

-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_checkpoints(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_no_safetensors(self):
-        pass
-
    # override since changing `output_hidden_states` / `output_attentions` from the top-level model config won't work
    # Ignore copy
    def test_retain_grad_hidden_states_attentions(self):
--- a/tests/models/paligemma/test_modeling_paligemma.py
+++ b/tests/models/paligemma/test_modeling_paligemma.py
@@ -326,18 +326,6 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
    def test_feed_forward_chunking(self):
        pass

-    @unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
-    @unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.")
-    def test_save_load_low_cpu_mem_usage_checkpoints(self):
-        pass
-
-    @unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.")
-    def test_save_load_low_cpu_mem_usage_no_safetensors(self):
-        pass
-
    @unittest.skip(
        reason="VLMs doesn't accept inputs embeds and pixel values at the same time. So if the test passed for backbone LM, it passes for VLM also"
    )
--- a/tests/models/paligemma2/test_modeling_paligemma2.py
+++ b/tests/models/paligemma2/test_modeling_paligemma2.py
@@ -316,18 +316,6 @@ class PaliGemma2ForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
    def test_feed_forward_chunking(self):
        pass

-    @unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
-    @unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.")
-    def test_save_load_low_cpu_mem_usage_checkpoints(self):
-        pass
-
-    @unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.")
-    def test_save_load_low_cpu_mem_usage_no_safetensors(self):
-        pass
-
    @unittest.skip(
        reason="VLMs doesn't accept inputs embeds and pixel values at the same time. So if the test passed for backbone LM, it passes for VLM also"
    )
--- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
@@ -368,10 +368,6 @@ class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
    def test_prompt_lookup_decoding_matches_greedy_search(self):
        super().test_prompt_lookup_decoding_matches_greedy_search()

-    @unittest.skip(reason="The base class is LM only and cannot be init with XModelConfig`")
-    def test_save_load_fast_init_from_base(self):
-        pass
-
    # The multimodal base model embeds will not match ids, due to pixel values. We can't change base test
    # because in some models `pixel_values` are required. Will be fixed when we add support for merging `embeds+pixels`
    # TODO: @raushan
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -318,10 +318,6 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
    def test_generate_from_inputs_embeds_with_static_cache(self):
        pass

-    @unittest.skip(reason="The base class is LM only and cannot be init with XModelConfig`")
-    def test_save_load_fast_init_from_base(self):
-        pass
-
    # The multimodal base model embeds will not match ids, due to pixel values. We can't change base test
    # because in some models `pixel_values` are required. Will be fixed when we add support for merging `embeds+pixels`
    # TODO: @raushan
--- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py
+++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py
@@ -182,7 +182,9 @@ class RecurrentGemmaIntegrationTest(unittest.TestCase):
    @require_read_token
    def test_2b_generate(self):
        EXPECTED_TEXTS = ['Hello I am doing a project on the topic of "The impact of the internet on the society" and I am looking for some information on the topic. I am looking for some information on the impact of the internet on the society. I am looking for some information on the impact of the internet on the society. I am looking for some', 'Hi today is a new app that allows you to make money by watching videos.\n\nThe app is very simple to use and you can earn money by watching videos.\n\nThe app is available for both Android and iOS devices and you can download it from the Google Play Store or the App Store.\n\nOnce you have downloaded the app']  # fmt: skip
-        model = AutoModelForCausalLM.from_pretrained(self.model_id, low_cpu_mem_usage=True).to(torch_device)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+        ).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        tokenizer.padding_side = "right"
@@ -204,9 +206,7 @@ class RecurrentGemmaIntegrationTest(unittest.TestCase):

        self.assertEqual(output_text, EXPECTED_TEXTS)

-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16
-        ).to(torch_device)
+        model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16).to(torch_device)
        output = model.generate(**inputs, max_new_tokens=64, do_sample=False)
        del model
        output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
@@ -246,9 +246,7 @@ class RecurrentGemmaIntegrationTest(unittest.TestCase):
    def test_long_context(self):
        EXPECTED_GENERATION = [' Jean-Paul Delannoy told CNN that the BEA is "not aware of any video footage that could have been taken on board the plane." He added that the BEA is "not aware of any video footage that could have been taken on board the plane." The BEA is the French equivalent of the National Transportation Safety Board']  # fmt: skip

-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16
-        ).to(torch_device)
+        model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16).to(torch_device)
        tokenizer = AutoTokenizer.from_pretrained(self.model_id, padding_side="left")
        inputs = tokenizer(self.input_long_text, return_tensors="pt").to(torch_device)
        output = model.generate(**inputs, max_new_tokens=64, do_sample=False)
@@ -260,9 +258,7 @@ class RecurrentGemmaIntegrationTest(unittest.TestCase):
    def test_longer_than_window(self):
        EXPECTED_GENERATION = [" Robin's comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the"]  # fmt: skip

-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16
-        ).to(torch_device)
+        model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16).to(torch_device)
        model.config.attention_window_size = 256  # Make the attention window size shorter than the current prompt
        tokenizer = AutoTokenizer.from_pretrained(self.model_id, padding_side="left")
        inputs = tokenizer(self.input_long_text, return_tensors="pt").to(torch_device)
--- a/tests/models/sam/test_modeling_sam.py
+++ b/tests/models/sam/test_modeling_sam.py
@@ -248,14 +248,6 @@ class SamVisionModelTest(ModelTesterMixin, unittest.TestCase):
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

-    @unittest.skip(reason="SamVisionModel has no base class and is not available in MODEL_MAPPING")
-    def test_save_load_fast_init_from_base(self):
-        pass
-
-    @unittest.skip(reason="SamVisionModel has no base class and is not available in MODEL_MAPPING")
-    def test_save_load_fast_init_to_base(self):
-        pass
-
    @unittest.skip(reason="SamVisionModel does not support training")
    def test_retain_grad_hidden_states_attentions(self):
        pass
--- a/tests/models/sam_hq/test_modeling_sam_hq.py
+++ b/tests/models/sam_hq/test_modeling_sam_hq.py
@@ -256,14 +256,6 @@ class SamHQVisionModelTest(ModelTesterMixin, unittest.TestCase):
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

-    @unittest.skip(reason="SamVisionModel has no base class and is not available in MODEL_MAPPING")
-    def test_save_load_fast_init_from_base(self):
-        pass
-
-    @unittest.skip(reason="SamVisionModel has no base class and is not available in MODEL_MAPPING")
-    def test_save_load_fast_init_to_base(self):
-        pass
-
    @unittest.skip(reason="SamVisionModel does not support training")
    def test_retain_grad_hidden_states_attentions(self):
        pass
@@ -695,14 +687,6 @@ class SamHQModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

-    @unittest.skip(reason="SamHQModel has no base class and is not available in MODEL_MAPPING")
-    def test_save_load_fast_init_from_base(self):
-        pass
-
-    @unittest.skip(reason="SamHQModel has no base class and is not available in MODEL_MAPPING")
-    def test_save_load_fast_init_to_base(self):
-        pass
-
    @unittest.skip(reason="SamHQModel does not support training")
    def test_retain_grad_hidden_states_attentions(self):
        pass
--- a/tests/models/sew/test_modeling_sew.py
+++ b/tests/models/sew/test_modeling_sew.py
@@ -325,18 +325,6 @@ class SEWModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    def test_model_get_set_embeddings(self):
        pass

-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_checkpoints(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_no_safetensors(self):
-        pass
-
    def test_retain_grad_hidden_states_attentions(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.output_hidden_states = True
--- a/tests/models/sew_d/test_modeling_sew_d.py
+++ b/tests/models/sew_d/test_modeling_sew_d.py
@@ -430,18 +430,6 @@ class SEWDModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    def test_feed_forward_chunking(self):
        pass

-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_checkpoints(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_no_safetensors(self):
-        pass
-
    @slow
    def test_model_from_pretrained(self):
        model = SEWDModel.from_pretrained("asapp/sew-d-tiny-100k")
--- a/tests/models/shieldgemma2/test_modeling_shieldgemma2.py
+++ b/tests/models/shieldgemma2/test_modeling_shieldgemma2.py
@@ -49,9 +49,9 @@ class ShieldGemma2IntegrationTest(unittest.TestCase):
        response = requests.get(url)
        image = Image.open(BytesIO(response.content))

-        model = ShieldGemma2ForImageClassification.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
-        ).to(torch_device)
+        model = ShieldGemma2ForImageClassification.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(
+            torch_device
+        )

        inputs = processor(images=[image]).to(torch_device)
        output = model(**inputs)
--- a/tests/models/t5/test_modeling_t5.py
+++ b/tests/models/t5/test_modeling_t5.py
@@ -1109,14 +1109,16 @@ class T5ModelFp16Tests(unittest.TestCase):

        # Load using `accelerate` in bf16
        model = T5ForConditionalGeneration.from_pretrained(
-            "google-t5/t5-small", torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
+            "google-t5/t5-small",
+            torch_dtype=torch.bfloat16,
        )
        self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.bfloat16)
        self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.bfloat16)

        # Load without using `accelerate`
        model = T5ForConditionalGeneration.from_pretrained(
-            "google-t5/t5-small", torch_dtype=torch.float16, low_cpu_mem_usage=True
+            "google-t5/t5-small",
+            torch_dtype=torch.float16,
        )
        self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32)
        self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.float16)
--- a/tests/models/timm_backbone/test_modeling_timm_backbone.py
+++ b/tests/models/timm_backbone/test_modeling_timm_backbone.py
@@ -156,18 +156,6 @@ class TimmBackboneModelTest(ModelTesterMixin, BackboneTesterMixin, PipelineTeste
    def test_save_load(self):
        pass

-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_checkpoints(self):
-        pass
-
-    @unittest.skip(reason="No support for low_cpu_mem_usage=True.")
-    def test_save_load_low_cpu_mem_usage_no_safetensors(self):
-        pass
-
    @unittest.skip(reason="TimmBackbone uses its own `from_pretrained` without device_map support")
    def test_can_load_with_device_context_manager(self):
        pass
--- a/tests/models/udop/test_modeling_udop.py
+++ b/tests/models/udop/test_modeling_udop.py
@@ -407,12 +407,6 @@ class UdopModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
            normalized_1 = F.softmax(out_shared_prefix_last_tokens)
            torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4)

-    @unittest.skip(
-        "Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!"
-    )
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
    @slow
    def test_model_from_pretrained(self):
        model_name = "microsoft/udop-large"
@@ -615,12 +609,6 @@ class UdopEncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase):
            normalized_1 = F.softmax(out_shared_prefix_last_tokens)
            torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4)

-    @unittest.skip(
-        "Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!"
-    )
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-

@require_torch
@require_sentencepiece
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -2431,7 +2431,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
        torch_dtype = torch.float16 if (torch.cuda.is_available() or is_torch_xpu_available()) else torch.float32
        model_id = "openai/whisper-large-v2"
        model = WhisperForConditionalGeneration.from_pretrained(
-            model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+            model_id, torch_dtype=torch_dtype, use_safetensors=True
        )
        model.to(torch_device)

@@ -2439,7 +2439,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):

        assistant_model_id = "distil-whisper/distil-large-v2"
        assistant_model = WhisperForCausalLM.from_pretrained(
-            assistant_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+            assistant_model_id, torch_dtype=torch_dtype, use_safetensors=True
        )
        assistant_model.to(torch_device)

@@ -2481,7 +2481,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
        torch_dtype = torch.float16 if torch_device in ["cuda", "xpu"] else torch.float32
        model_id = "openai/whisper-large-v2"
        model = WhisperForConditionalGeneration.from_pretrained(
-            model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+            model_id, torch_dtype=torch_dtype, use_safetensors=True
        )
        model.to(torch_device)

@@ -2489,7 +2489,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):

        assistant_model_id = "openai/whisper-tiny"
        assistant_model = WhisperForConditionalGeneration.from_pretrained(
-            assistant_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+            assistant_model_id, torch_dtype=torch_dtype, use_safetensors=True
        )
        assistant_model.to(torch_device)

--- a/tests/models/zamba/test_modeling_zamba.py
+++ b/tests/models/zamba/test_modeling_zamba.py
@@ -531,7 +531,6 @@ class ZambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
                    tmpdirname,
                    torch_dtype=torch.float16,
                    attn_implementation="flash_attention_2",
-                    low_cpu_mem_usage=True,
                    load_in_4bit=True,
                )

@@ -565,9 +564,7 @@ class ZambaModelIntegrationTest(unittest.TestCase):
    @slow
    def setUpClass(cls):
        model_id = "Zyphra/Zamba-7B-v1"
-        cls.model = ZambaForCausalLM.from_pretrained(
-            model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, use_mamba_kernels=False
-        )
+        cls.model = ZambaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, use_mamba_kernels=False)
        cls.tokenizer = AutoTokenizer.from_pretrained(model_id)

    @slow
--- a/tests/models/zamba2/test_modeling_zamba2.py
+++ b/tests/models/zamba2/test_modeling_zamba2.py
@@ -549,7 +549,6 @@ class Zamba2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
                    tmpdirname,
                    torch_dtype=torch.float16,
                    attn_implementation="flash_attention_2",
-                    low_cpu_mem_usage=True,
                    load_in_4bit=True,
                )

@@ -610,9 +609,7 @@ class Zamba2ModelIntegrationTest(unittest.TestCase):
    @slow
    def setUpClass(cls):
        model_id = "Zyphra/Zamba2-1.2B"
-        cls.model = Zamba2ForCausalLM.from_pretrained(
-            model_id, torch_dtype=torch.float32, low_cpu_mem_usage=True, revision="PR"
-        )
+        cls.model = Zamba2ForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, revision="PR")
        cls.tokenizer = AutoTokenizer.from_pretrained(model_id, revision="PR")

    @parameterized.expand([(torch_device,), ("cpu",)])