Remove all traces of low_cpu_mem_usage (#38792)

* remove it from all py files

* remove it from the doc

* remove it from examples

* style

* remove traces of _fast_init

* Update test_peft_integration.py

* CIs
This commit is contained in:
Cyril Vallez
2025-06-12 16:39:33 +02:00
committed by GitHub
parent 3542e0b844
commit 4b8ec667e9
76 changed files with 100 additions and 598 deletions

View File

@@ -126,9 +126,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to(
torch_device
)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)
model.generation_config.cache_implementation = "static"
@@ -149,9 +147,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
torch_device
)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -171,9 +167,7 @@ class GemmaIntegrationTest(unittest.TestCase):
]
# bfloat16 gives strange values, likely due to it has lower precision + very short prompts
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager"
)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, attn_implementation="eager")
model.to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -195,7 +189,7 @@ class GemmaIntegrationTest(unittest.TestCase):
]
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
model_id, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
)
model.to(torch_device)
@@ -216,7 +210,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi today I'd like to share with you my experience with the new wattpad wattpad wattpad wattpad wattpad wattpad wattpad",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -235,7 +229,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi,\n\nI have a problem with my 2005 1.6 16",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(torch_device)
model = AutoModelForCausalLM.from_pretrained(model_id).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -256,9 +250,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi today I am going to show you how to make a simple and easy to make a DIY 3D",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to(
torch_device
)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -290,9 +282,7 @@ class GemmaIntegrationTest(unittest.TestCase):
# fmt: on
expected_text = EXPECTED_TEXTS.get_expectation()
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
torch_device
)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -312,9 +302,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi today I am going to show you how to make a simple and easy to make a DIY 3D",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to(
torch_device
)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)
model.generation_config.cache_implementation = "static"
@@ -333,7 +321,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
@@ -451,9 +439,7 @@ class GemmaIntegrationTest(unittest.TestCase):
"Hi today we have the review for a <strong>2016/2017</strong> season of",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
torch_device
)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)