Remove all traces of low_cpu_mem_usage (#38792)

* remove it from all py files
* remove it from the doc
* remove it from examples
* style
* remove traces of _fast_init
* Update test_peft_integration.py
* CIs
2025-06-12 16:39:33 +02:00
parent 3542e0b844
commit 4b8ec667e9
76 changed files with 100 additions and 598 deletions
--- a/tests/models/gemma/test_modeling_gemma.py
+++ b/tests/models/gemma/test_modeling_gemma.py
@@ -126,9 +126,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to(
-            torch_device
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)

        model.generation_config.cache_implementation = "static"

@@ -149,9 +147,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
-            torch_device
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -171,9 +167,7 @@ class GemmaIntegrationTest(unittest.TestCase):
        ]

        # bfloat16 gives strange values, likely due to it has lower precision + very short prompts
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager"
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, attn_implementation="eager")
        model.to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -195,7 +189,7 @@ class GemmaIntegrationTest(unittest.TestCase):
        ]

        model = AutoModelForCausalLM.from_pretrained(
-            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
+            model_id, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
        )
        model.to(torch_device)

@@ -216,7 +210,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi today I'd like to share with you my experience with the new wattpad wattpad wattpad wattpad wattpad wattpad wattpad",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True)
+        model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -235,7 +229,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi,\n\nI have a problem with my 2005 1.6 16",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(torch_device)
+        model = AutoModelForCausalLM.from_pretrained(model_id).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -256,9 +250,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi today I am going to show you how to make a simple and easy to make a DIY 3D",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to(
-            torch_device
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -290,9 +282,7 @@ class GemmaIntegrationTest(unittest.TestCase):
        # fmt: on
        expected_text = EXPECTED_TEXTS.get_expectation()

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
-            torch_device
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -312,9 +302,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi today I am going to show you how to make a simple and easy to make a DIY 3D",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to(
-            torch_device
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)

        model.generation_config.cache_implementation = "static"

@@ -333,7 +321,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True)
+        model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -451,9 +439,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            "Hi today we have the review for a <strong>2016/2017</strong> season of",
        ]

-        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
-            torch_device
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)