[tests] make test_from_pretrained_low_cpu_mem_usage_equal less flaky (#36255)

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Joao Gante
2025-02-19 15:14:02 +00:00
committed by GitHub
parent 99adc74462
commit e3d99ec2f5

View File

@@ -993,6 +993,7 @@ class ModelUtilsTest(TestCasePlus):
for mname in mnames:
_ = BertModel.from_pretrained(mname, low_cpu_mem_usage=True)
@slow
@require_usr_bin_time
@require_accelerate
@mark.accelerate_tests
@@ -1001,30 +1002,29 @@ class ModelUtilsTest(TestCasePlus):
# Now though these should be around the same.
# TODO: Look for good bounds to check that their timings are near the same
mname = "hf-internal-testing/tiny-random-bert"
mname = "HuggingFaceTB/SmolLM-135M"
preamble = "from transformers import AutoModel"
one_liner_str = f'{preamble}; AutoModel.from_pretrained("{mname}", low_cpu_mem_usage=False)'
# Save this output as `max_rss_normal` if testing memory results
max_rss_normal = self.python_one_liner_max_rss(one_liner_str)
# print(f"{max_rss_normal=}")
one_liner_str = f'{preamble}; AutoModel.from_pretrained("{mname}", low_cpu_mem_usage=True)'
# Save this output as `max_rss_low_mem` if testing memory results
max_rss_low_mem = self.python_one_liner_max_rss(one_liner_str)
# Should be within 2MBs of each other (overhead)
# Should be within 5 MB of each other (overhead)
self.assertAlmostEqual(
max_rss_normal / 1024 / 1024,
max_rss_low_mem / 1024 / 1024,
delta=2,
delta=5,
msg="using `low_cpu_mem_usage` should incur the same memory usage in both cases.",
)
# if you want to compare things manually, let's first look at the size of the model in bytes
# model = BertModel.from_pretrained(mname, low_cpu_mem_usage=False)
# model = AutoModel.from_pretrained(mname, low_cpu_mem_usage=False)
# total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
# total_bytes = total_numel * 4 # 420MB
# total_bytes = total_numel * 4
# Now the diff_bytes should be very close to total_bytes, but the reports are inconsistent.
# The easiest way to test this is to switch the model and torch.load to do all the work on
# GPU - that way one can measure exactly the total and peak memory used. Perhaps once we add