Remove all traces of low_cpu_mem_usage (#38792)
* remove it from all py files * remove it from the doc * remove it from examples * style * remove traces of _fast_init * Update test_peft_integration.py * CIs
This commit is contained in:
@@ -64,7 +64,6 @@ from transformers.testing_utils import (
|
||||
require_torch,
|
||||
require_torch_accelerator,
|
||||
require_torch_multi_accelerator,
|
||||
require_usr_bin_time,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
@@ -1003,57 +1002,6 @@ class ModelUtilsTest(TestCasePlus):
|
||||
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
@require_accelerate
|
||||
@mark.accelerate_tests
|
||||
def test_from_pretrained_low_cpu_mem_usage_functional(self):
|
||||
# test that we can use `from_pretrained(..., low_cpu_mem_usage=True)` with normal and
|
||||
# sharded models
|
||||
|
||||
mnames = [
|
||||
"hf-internal-testing/tiny-random-bert-sharded",
|
||||
"hf-internal-testing/tiny-random-bert",
|
||||
]
|
||||
for mname in mnames:
|
||||
_ = BertModel.from_pretrained(mname, low_cpu_mem_usage=True)
|
||||
|
||||
@slow
|
||||
@require_usr_bin_time
|
||||
@require_accelerate
|
||||
@mark.accelerate_tests
|
||||
def test_from_pretrained_low_cpu_mem_usage_equal(self):
|
||||
# Before this would test that `from_pretrained(..., low_cpu_mem_usage=True)` uses less cpu memory than default
|
||||
# Now though these should be around the same.
|
||||
# TODO: Look for good bounds to check that their timings are near the same
|
||||
|
||||
mname = "HuggingFaceTB/SmolLM-135M"
|
||||
|
||||
preamble = "from transformers import AutoModel"
|
||||
one_liner_str = f'{preamble}; AutoModel.from_pretrained("{mname}", low_cpu_mem_usage=False)'
|
||||
# Save this output as `max_rss_normal` if testing memory results
|
||||
max_rss_normal = self.python_one_liner_max_rss(one_liner_str)
|
||||
|
||||
one_liner_str = f'{preamble}; AutoModel.from_pretrained("{mname}", low_cpu_mem_usage=True)'
|
||||
# Save this output as `max_rss_low_mem` if testing memory results
|
||||
max_rss_low_mem = self.python_one_liner_max_rss(one_liner_str)
|
||||
|
||||
# Should be within 5MBs of each other (overhead)
|
||||
self.assertAlmostEqual(
|
||||
max_rss_normal / 1024 / 1024,
|
||||
max_rss_low_mem / 1024 / 1024,
|
||||
delta=5,
|
||||
msg="using `low_cpu_mem_usage` should incur the same memory usage in both cases.",
|
||||
)
|
||||
|
||||
# if you want to compare things manually, let's first look at the size of the model in bytes
|
||||
# model = AutoModel.from_pretrained(mname, low_cpu_mem_usage=False)
|
||||
# total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
|
||||
# total_bytes = total_numel * 4
|
||||
# Now the diff_bytes should be very close to total_bytes, but the reports are inconsistent.
|
||||
# The easiest way to test this is to switch the model and torch.load to do all the work on
|
||||
# gpu - that way one can measure exactly the total and peak memory used. Perhaps once we add
|
||||
# functionality to load models directly on gpu, this test can be rewritten to use torch's
|
||||
# cuda memory tracking and then we should be able to do a much more precise test.
|
||||
|
||||
@require_accelerate
|
||||
@mark.accelerate_tests
|
||||
@require_torch_multi_accelerator
|
||||
@@ -1537,7 +1485,6 @@ class ModelUtilsTest(TestCasePlus):
|
||||
config=model_config,
|
||||
ignore_mismatched_sizes=True,
|
||||
torch_dtype=torch.float16,
|
||||
low_cpu_mem_usage=True,
|
||||
)
|
||||
model_ref = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id)
|
||||
|
||||
@@ -1782,16 +1729,6 @@ class ModelUtilsTest(TestCasePlus):
|
||||
)
|
||||
self.assertTrue(check_models_equal(model, model_loaded))
|
||||
|
||||
def test_load_model_with_state_dict_only_low_cpu_mem_usage(self):
|
||||
model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
|
||||
state_dict = model.state_dict()
|
||||
config = model.config
|
||||
|
||||
model_loaded = BertModel.from_pretrained(
|
||||
pretrained_model_name_or_path=None, config=config, state_dict=state_dict, low_cpu_mem_usage=True
|
||||
)
|
||||
self.assertTrue(check_models_equal(model, model_loaded))
|
||||
|
||||
def test_cache_when_needed_at_train_time(self):
|
||||
"""
|
||||
Some fine-tuning methods require the use of cache, like prefix tuning in PEFT. This test checks that a cache
|
||||
|
||||
Reference in New Issue
Block a user