From 2a004f9ff1a8f118318f0f82f64b74a2655d26d0 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 13 Mar 2025 17:07:30 +0100 Subject: [PATCH] Add loading speed test (#36671) * Update test_modeling_utils.py * Update test_modeling_utils.py * Update test_modeling_utils.py * Update test_modeling_utils.py * Update test_modeling_utils.py * Update test_modeling_utils.py * trigger CIs * Update test_modeling_utils.py * Update test_modeling_utils.py * Update test_modeling_utils.py * better error messages * Update test_modeling_utils.py * Update test_modeling_utils.py --- tests/utils/test_modeling_utils.py | 60 ++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index 7d69073147..c51ca2c438 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -17,8 +17,10 @@ import glob import json import os import os.path +import subprocess import sys import tempfile +import textwrap import threading import unittest import unittest.mock as mock @@ -28,6 +30,7 @@ from pathlib import Path import requests from huggingface_hub import HfApi, HfFolder +from parameterized import parameterized from pytest import mark from requests.exceptions import HTTPError @@ -55,10 +58,12 @@ from transformers.testing_utils import ( is_staging_test, require_accelerate, require_flax, + require_read_token, require_safetensors, require_tf, require_torch, require_torch_accelerator, + require_torch_gpu, require_torch_multi_accelerator, require_usr_bin_time, slow, @@ -1900,6 +1905,61 @@ class ModelUtilsTest(TestCasePlus): self.assertEqual(len(cm.records), 1) self.assertTrue(cm.records[0].message.startswith("Unknown quantization type, got")) + @parameterized.expand([("Qwen/Qwen2.5-3B-Instruct", 10), ("meta-llama/Llama-2-7b-chat-hf", 10)]) + @slow + @require_read_token + @require_torch_gpu + def test_loading_is_fast_on_gpu(self, model_id: str, max_loading_time: float): + """ + This 
test is used to avoid regression on https://github.com/huggingface/transformers/pull/36380. + 10s should be more than enough for both models, and allows for some margin as loading times are quite + unstable. Before #36380, it used to take more than 40s, so 10s is still reasonable. + Note that we run this test in a subprocess, to ensure that cuda is not already initialized/warmed-up. + """ + # First download the weights if not already on disk + _ = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16) + + script_to_run = textwrap.dedent( + """ + import torch + import time + import argparse + from transformers import AutoModelForCausalLM + + parser = argparse.ArgumentParser() + parser.add_argument("model_id", type=str) + parser.add_argument("max_loading_time", type=float) + args = parser.parse_args() + + device = torch.device("cuda:0") + + torch.cuda.synchronize(device) + t0 = time.time() + model = AutoModelForCausalLM.from_pretrained(args.model_id, torch_dtype=torch.float16, device_map=device) + torch.cuda.synchronize(device) + dt = time.time() - t0 + + # Assert loading is faster (it should be more than enough in both cases) + if dt > args.max_loading_time: + raise ValueError(f"Loading took {dt:.2f}s! 
It should not take more than {args.max_loading_time}s") + # Ensure everything is correctly loaded on gpu + bad_device_params = {k for k, v in model.named_parameters() if v.device != device} + if len(bad_device_params) > 0: + raise ValueError(f"The following parameters are not on GPU: {bad_device_params}") + """ + ) + + with tempfile.NamedTemporaryFile(mode="w+", suffix=".py") as tmp: + tmp.write(script_to_run) + tmp.flush() + tmp.seek(0) + cmd = [sys.executable, tmp.name, model_id, str(max_loading_time)]  # reuse current interpreter + try: + # We cannot use a timeout of `max_loading_time` as cuda initialization can take up to 15-20s + _ = subprocess.run(cmd, capture_output=True, env=self.get_env(), text=True, check=True, timeout=60) + except subprocess.CalledProcessError as e: + raise Exception(f"The following error was captured: {e.stderr}") from e + @slow @require_torch