switch to device agnostic device calling for test cases (#38247)
* use device agnostic APIs in test cases Signed-off-by: Matrix Yao <matrix.yao@intel.com> * fix style Signed-off-by: Matrix Yao <matrix.yao@intel.com> * add one more Signed-off-by: YAO Matrix <matrix.yao@intel.com> * xpu now supports integer device id, aligning to CUDA behaviors Signed-off-by: Matrix Yao <matrix.yao@intel.com> * update to use device_properties Signed-off-by: Matrix Yao <matrix.yao@intel.com> * fix style Signed-off-by: Matrix Yao <matrix.yao@intel.com> * update comment Signed-off-by: Matrix Yao <matrix.yao@intel.com> * fix comments Signed-off-by: Matrix Yao <matrix.yao@intel.com> * fix style Signed-off-by: Matrix Yao <matrix.yao@intel.com> --------- Signed-off-by: Matrix Yao <matrix.yao@intel.com> Signed-off-by: YAO Matrix <matrix.yao@intel.com> Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -22,6 +22,7 @@ from packaging import version
|
||||
|
||||
from transformers import AqlmConfig, AutoConfig, AutoModelForCausalLM, AutoTokenizer, OPTForCausalLM, StaticCache
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_aqlm,
|
||||
require_torch_gpu,
|
||||
@@ -81,8 +82,6 @@ class AqlmTest(unittest.TestCase):
|
||||
|
||||
EXPECTED_OUTPUT = "Hello my name is Katie. I am a 20 year old college student. I am a very outgoing person. I love to have fun and be active. I"
|
||||
|
||||
device_map = "cuda"
|
||||
|
||||
# called only once for all test in this class
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
@@ -92,12 +91,12 @@ class AqlmTest(unittest.TestCase):
|
||||
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
|
||||
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||
cls.model_name,
|
||||
device_map=cls.device_map,
|
||||
device_map=torch_device,
|
||||
)
|
||||
|
||||
def tearDown(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
gc.collect()
|
||||
|
||||
def test_quantized_model_conversion(self):
|
||||
@@ -170,7 +169,7 @@ class AqlmTest(unittest.TestCase):
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
self.quantized_model.save_pretrained(tmpdirname)
|
||||
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)
|
||||
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=torch_device)
|
||||
|
||||
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ import unittest
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AwqConfig, OPTForCausalLM
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
get_device_properties,
|
||||
require_accelerate,
|
||||
require_auto_awq,
|
||||
require_flash_attn,
|
||||
@@ -61,12 +62,10 @@ class AwqConfigTest(unittest.TestCase):
|
||||
|
||||
# Only cuda and xpu devices can run this function
|
||||
support_llm_awq = False
|
||||
if torch.cuda.is_available():
|
||||
compute_capability = torch.cuda.get_device_capability()
|
||||
major, minor = compute_capability
|
||||
if major >= 8:
|
||||
support_llm_awq = True
|
||||
elif torch.xpu.is_available():
|
||||
device_type, major = get_device_properties()
|
||||
if device_type == "cuda" and major >= 8:
|
||||
support_llm_awq = True
|
||||
elif device_type == "xpu":
|
||||
support_llm_awq = True
|
||||
|
||||
if support_llm_awq:
|
||||
@@ -357,7 +356,7 @@ class AwqFusedTest(unittest.TestCase):
|
||||
self.assertTrue(isinstance(model.model.layers[0].block_sparse_moe.gate, torch.nn.Linear))
|
||||
|
||||
@unittest.skipIf(
|
||||
torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8,
|
||||
get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
|
||||
"Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
|
||||
)
|
||||
@require_flash_attn
|
||||
@@ -388,7 +387,7 @@ class AwqFusedTest(unittest.TestCase):
|
||||
@require_flash_attn
|
||||
@require_torch_gpu
|
||||
@unittest.skipIf(
|
||||
torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8,
|
||||
get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
|
||||
"Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
|
||||
)
|
||||
def test_generation_fused_batched(self):
|
||||
@@ -441,7 +440,7 @@ class AwqFusedTest(unittest.TestCase):
|
||||
@require_flash_attn
|
||||
@require_torch_multi_gpu
|
||||
@unittest.skipIf(
|
||||
torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8,
|
||||
get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
|
||||
"Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
|
||||
)
|
||||
def test_generation_custom_model(self):
|
||||
|
||||
@@ -23,6 +23,7 @@ from transformers import (
|
||||
OPTForCausalLM,
|
||||
)
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_torch_gpu,
|
||||
slow,
|
||||
@@ -56,7 +57,6 @@ class BitNetQuantConfigTest(unittest.TestCase):
|
||||
@require_accelerate
|
||||
class BitNetTest(unittest.TestCase):
|
||||
model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"
|
||||
device = "cuda"
|
||||
|
||||
# called only once for all test in this class
|
||||
@classmethod
|
||||
@@ -65,11 +65,11 @@ class BitNetTest(unittest.TestCase):
|
||||
Load the model
|
||||
"""
|
||||
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
|
||||
cls.quantized_model = AutoModelForCausalLM.from_pretrained(cls.model_name, device_map=cls.device)
|
||||
cls.quantized_model = AutoModelForCausalLM.from_pretrained(cls.model_name, device_map=torch_device)
|
||||
|
||||
def tearDown(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
gc.collect()
|
||||
|
||||
def test_replace_with_bitlinear(self):
|
||||
@@ -100,7 +100,7 @@ class BitNetTest(unittest.TestCase):
|
||||
"""
|
||||
input_text = "What are we having for dinner?"
|
||||
expected_output = "What are we having for dinner? What are we going to do for fun this weekend?"
|
||||
input_ids = self.tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
input_ids = self.tokenizer(input_text, return_tensors="pt").to(torch_device)
|
||||
|
||||
output = self.quantized_model.generate(**input_ids, max_new_tokens=11, do_sample=False)
|
||||
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), expected_output)
|
||||
@@ -127,7 +127,7 @@ class BitNetTest(unittest.TestCase):
|
||||
from transformers.integrations import BitLinear
|
||||
|
||||
layer = BitLinear(in_features=4, out_features=2, bias=False, dtype=torch.float32)
|
||||
layer.to(self.device)
|
||||
layer.to(torch_device)
|
||||
|
||||
input_tensor = torch.tensor([1.0, -1.0, -1.0, 1.0], dtype=torch.float32).to(torch_device)
|
||||
|
||||
@@ -202,9 +202,8 @@ class BitNetTest(unittest.TestCase):
|
||||
class BitNetSerializationTest(unittest.TestCase):
|
||||
def test_model_serialization(self):
|
||||
model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"
|
||||
device = "cuda"
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
|
||||
input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=device)
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=torch_device)
|
||||
input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device)
|
||||
|
||||
with torch.no_grad():
|
||||
logits_ref = quantized_model.forward(input_tensor).logits
|
||||
@@ -215,10 +214,10 @@ class BitNetSerializationTest(unittest.TestCase):
|
||||
|
||||
# Remove old model
|
||||
del quantized_model
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
# Load and check if the logits match
|
||||
model_loaded = AutoModelForCausalLM.from_pretrained("quant_model", device_map=device)
|
||||
model_loaded = AutoModelForCausalLM.from_pretrained("quant_model", device_map=torch_device)
|
||||
|
||||
with torch.no_grad():
|
||||
logits_loaded = model_loaded.forward(input_tensor).logits
|
||||
|
||||
@@ -32,6 +32,7 @@ from transformers.models.opt.modeling_opt import OPTAttention
|
||||
from transformers.testing_utils import (
|
||||
apply_skip_if_not_implemented,
|
||||
backend_empty_cache,
|
||||
backend_torch_accelerator_module,
|
||||
is_bitsandbytes_available,
|
||||
is_torch_available,
|
||||
require_accelerate,
|
||||
@@ -376,7 +377,7 @@ class Bnb4BitT5Test(unittest.TestCase):
|
||||
avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
|
||||
"""
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
def test_inference_without_keep_in_fp32(self):
|
||||
r"""
|
||||
@@ -460,7 +461,7 @@ class Classes4BitModelTest(Base4bitTest):
|
||||
del self.seq_to_seq_model
|
||||
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
def test_correct_head_class(self):
|
||||
r"""
|
||||
@@ -491,7 +492,7 @@ class Pipeline4BitTest(Base4bitTest):
|
||||
del self.pipe
|
||||
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
def test_pipeline(self):
|
||||
r"""
|
||||
@@ -589,10 +590,10 @@ class Bnb4BitTestTraining(Base4bitTest):
|
||||
# Step 1: freeze all parameters
|
||||
model = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True)
|
||||
|
||||
if torch.cuda.is_available():
|
||||
self.assertEqual(set(model.hf_device_map.values()), {torch.cuda.current_device()})
|
||||
elif torch.xpu.is_available():
|
||||
self.assertEqual(set(model.hf_device_map.values()), {f"xpu:{torch.xpu.current_device()}"})
|
||||
if torch_device in ["cuda", "xpu"]:
|
||||
self.assertEqual(
|
||||
set(model.hf_device_map.values()), {backend_torch_accelerator_module(torch_device).current_device()}
|
||||
)
|
||||
else:
|
||||
self.assertTrue(all(param.device.type == "cpu" for param in model.parameters()))
|
||||
|
||||
|
||||
@@ -31,6 +31,8 @@ from transformers import (
|
||||
from transformers.models.opt.modeling_opt import OPTAttention
|
||||
from transformers.testing_utils import (
|
||||
apply_skip_if_not_implemented,
|
||||
backend_empty_cache,
|
||||
backend_torch_accelerator_module,
|
||||
is_accelerate_available,
|
||||
is_bitsandbytes_available,
|
||||
is_torch_available,
|
||||
@@ -137,7 +139,7 @@ class MixedInt8Test(BaseMixedInt8Test):
|
||||
del self.model_8bit
|
||||
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
def test_get_keys_to_not_convert(self):
|
||||
r"""
|
||||
@@ -484,7 +486,7 @@ class MixedInt8T5Test(unittest.TestCase):
|
||||
avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
|
||||
"""
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
def test_inference_without_keep_in_fp32(self):
|
||||
r"""
|
||||
@@ -599,7 +601,7 @@ class MixedInt8ModelClassesTest(BaseMixedInt8Test):
|
||||
del self.seq_to_seq_model
|
||||
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
def test_correct_head_class(self):
|
||||
r"""
|
||||
@@ -631,7 +633,7 @@ class MixedInt8TestPipeline(BaseMixedInt8Test):
|
||||
del self.pipe
|
||||
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
def test_pipeline(self):
|
||||
r"""
|
||||
@@ -872,10 +874,10 @@ class MixedInt8TestTraining(BaseMixedInt8Test):
|
||||
model = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_8bit=True)
|
||||
model.train()
|
||||
|
||||
if torch.cuda.is_available():
|
||||
self.assertEqual(set(model.hf_device_map.values()), {torch.cuda.current_device()})
|
||||
elif torch.xpu.is_available():
|
||||
self.assertEqual(set(model.hf_device_map.values()), {f"xpu:{torch.xpu.current_device()}"})
|
||||
if torch_device in ["cuda", "xpu"]:
|
||||
self.assertEqual(
|
||||
set(model.hf_device_map.values()), {backend_torch_accelerator_module(torch_device).current_device()}
|
||||
)
|
||||
else:
|
||||
self.assertTrue(all(param.device.type == "cpu" for param in model.parameters()))
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ import unittest
|
||||
import warnings
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from transformers.testing_utils import require_compressed_tensors, require_torch
|
||||
from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
|
||||
from transformers.utils import is_torch_available
|
||||
from transformers.utils.quantization_config import CompressedTensorsConfig
|
||||
|
||||
@@ -41,7 +41,7 @@ class StackCompressedModelTest(unittest.TestCase):
|
||||
|
||||
def tearDown(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
gc.collect()
|
||||
|
||||
def test_compressed_uncompressed_model_shapes(self):
|
||||
@@ -160,7 +160,7 @@ class RunCompressedTest(unittest.TestCase):
|
||||
|
||||
def tearDown(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
gc.collect()
|
||||
|
||||
def test_default_run_compressed__True(self):
|
||||
|
||||
@@ -2,7 +2,7 @@ import gc
|
||||
import unittest
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, CompressedTensorsConfig
|
||||
from transformers.testing_utils import require_compressed_tensors, require_torch
|
||||
from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
|
||||
from transformers.utils import is_torch_available
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ class CompressedTensorsTest(unittest.TestCase):
|
||||
|
||||
def tearDown(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
gc.collect()
|
||||
|
||||
def test_config_args(self):
|
||||
|
||||
@@ -18,6 +18,7 @@ import unittest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, EetqConfig, OPTForCausalLM
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_eetq,
|
||||
require_torch_gpu,
|
||||
@@ -87,7 +88,7 @@ class EetqTest(unittest.TestCase):
|
||||
|
||||
def tearDown(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
gc.collect()
|
||||
|
||||
def test_quantized_model_conversion(self):
|
||||
|
||||
@@ -18,6 +18,7 @@ import unittest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, FbgemmFp8Config, OPTForCausalLM
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_fbgemm_gpu,
|
||||
require_read_token,
|
||||
@@ -126,7 +127,7 @@ class FbgemmFp8Test(unittest.TestCase):
|
||||
|
||||
def tearDown(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
gc.collect()
|
||||
|
||||
def test_quantized_model_conversion(self):
|
||||
|
||||
@@ -19,6 +19,7 @@ import unittest
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, FineGrainedFP8Config, OPTForCausalLM
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
get_device_properties,
|
||||
require_accelerate,
|
||||
require_read_token,
|
||||
require_torch_accelerator,
|
||||
@@ -254,7 +255,7 @@ class FP8LinearTest(unittest.TestCase):
|
||||
device = torch_device
|
||||
|
||||
@unittest.skipIf(
|
||||
torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 9,
|
||||
get_device_properties()[0] == "cuda" and get_device_properties()[1] < 9,
|
||||
"Skipping FP8LinearTest because it is not supported on GPU with capability < 9.0",
|
||||
)
|
||||
def test_linear_preserves_shape(self):
|
||||
@@ -270,7 +271,7 @@ class FP8LinearTest(unittest.TestCase):
|
||||
self.assertEqual(x_.shape, x.shape)
|
||||
|
||||
@unittest.skipIf(
|
||||
torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 9,
|
||||
get_device_properties()[0] == "cuda" and get_device_properties()[1] < 9,
|
||||
"Skipping FP8LinearTest because it is not supported on GPU with capability < 9.0",
|
||||
)
|
||||
def test_linear_with_diff_feature_size_preserves_shape(self):
|
||||
|
||||
@@ -18,6 +18,7 @@ import unittest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HiggsConfig, OPTForCausalLM
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_flute_hadamard,
|
||||
require_torch_gpu,
|
||||
@@ -87,7 +88,7 @@ class HiggsTest(unittest.TestCase):
|
||||
|
||||
def tearDown(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
gc.collect()
|
||||
|
||||
def test_quantized_model_conversion(self):
|
||||
|
||||
@@ -17,6 +17,7 @@ import unittest
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_hqq,
|
||||
require_torch_gpu,
|
||||
@@ -50,7 +51,7 @@ class HQQLLMRunner:
|
||||
|
||||
|
||||
def cleanup():
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
gc.collect()
|
||||
|
||||
|
||||
@@ -187,7 +188,7 @@ class HQQTestBias(unittest.TestCase):
|
||||
hqq_runner.model.save_pretrained(tmpdirname)
|
||||
|
||||
del hqq_runner.model
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
model_loaded = AutoModelForCausalLM.from_pretrained(
|
||||
tmpdirname, torch_dtype=torch.float16, device_map=torch_device
|
||||
@@ -228,7 +229,7 @@ class HQQSerializationTest(unittest.TestCase):
|
||||
|
||||
# Remove old model
|
||||
del hqq_runner.model
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
# Load and check if the logits match
|
||||
model_loaded = AutoModelForCausalLM.from_pretrained(
|
||||
|
||||
@@ -18,6 +18,7 @@ import unittest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, SpQRConfig, StaticCache
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_spqr,
|
||||
require_torch_gpu,
|
||||
@@ -82,8 +83,6 @@ class SpQRTest(unittest.TestCase):
|
||||
)
|
||||
EXPECTED_OUTPUT_COMPILE = "Hello my name is Jake and I am a 20 year old student at the University of North Texas. (Go Mean Green!) I am a huge fan of the Dallas"
|
||||
|
||||
device_map = "cuda"
|
||||
|
||||
# called only once for all test in this class
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
@@ -93,12 +92,12 @@ class SpQRTest(unittest.TestCase):
|
||||
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
|
||||
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||
cls.model_name,
|
||||
device_map=cls.device_map,
|
||||
device_map=torch_device,
|
||||
)
|
||||
|
||||
def tearDown(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
gc.collect()
|
||||
|
||||
def test_quantized_model_conversion(self):
|
||||
@@ -158,7 +157,7 @@ class SpQRTest(unittest.TestCase):
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
self.quantized_model.save_pretrained(tmpdirname)
|
||||
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)
|
||||
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=torch_device)
|
||||
|
||||
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
|
||||
|
||||
|
||||
@@ -21,10 +21,13 @@ from packaging import version
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
get_device_properties,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
require_torchao,
|
||||
require_torchao_version_greater_or_equal,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_torch_available, is_torchao_available
|
||||
|
||||
@@ -131,7 +134,7 @@ class TorchAoTest(unittest.TestCase):
|
||||
|
||||
def tearDown(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
gc.collect()
|
||||
|
||||
def test_int4wo_quant(self):
|
||||
@@ -260,7 +263,7 @@ class TorchAoTest(unittest.TestCase):
|
||||
|
||||
@require_torch_gpu
|
||||
class TorchAoGPUTest(TorchAoTest):
|
||||
device = "cuda"
|
||||
device = torch_device
|
||||
quant_scheme_kwargs = {"group_size": 32}
|
||||
|
||||
def test_int4wo_offload(self):
|
||||
@@ -397,7 +400,7 @@ class TorchAoSerializationTest(unittest.TestCase):
|
||||
|
||||
def tearDown(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
gc.collect()
|
||||
|
||||
def test_original_model_expected_output(self):
|
||||
@@ -452,33 +455,33 @@ class TorchAoSerializationW8CPUTest(TorchAoSerializationTest):
|
||||
@require_torch_gpu
|
||||
class TorchAoSerializationGPTTest(TorchAoSerializationTest):
|
||||
quant_scheme, quant_scheme_kwargs = "int4_weight_only", {"group_size": 32}
|
||||
device = "cuda:0"
|
||||
device = f"{torch_device}:0"
|
||||
|
||||
|
||||
@require_torch_gpu
|
||||
class TorchAoSerializationW8A8GPUTest(TorchAoSerializationTest):
|
||||
quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {}
|
||||
EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
|
||||
device = "cuda:0"
|
||||
device = f"{torch_device}:0"
|
||||
|
||||
|
||||
@require_torch_gpu
|
||||
class TorchAoSerializationW8GPUTest(TorchAoSerializationTest):
|
||||
quant_scheme, quant_scheme_kwargs = "int8_weight_only", {}
|
||||
EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
|
||||
device = "cuda:0"
|
||||
device = f"{torch_device}:0"
|
||||
|
||||
|
||||
@require_torch_gpu
|
||||
@require_torchao_version_greater_or_equal("0.10.0")
|
||||
class TorchAoSerializationFP8GPUTest(TorchAoSerializationTest):
|
||||
EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
|
||||
device = "cuda:0"
|
||||
device = f"{torch_device}:0"
|
||||
|
||||
# called only once for all test in this class
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
if not torch.cuda.is_available() or torch.cuda.get_device_capability()[0] < 9:
|
||||
if not (get_device_properties()[0] == "cuda" and get_device_properties()[1] >= 9):
|
||||
raise unittest.SkipTest("CUDA compute capability 9.0 or higher required for FP8 tests")
|
||||
|
||||
from torchao.quantization import Float8WeightOnlyConfig
|
||||
@@ -493,12 +496,12 @@ class TorchAoSerializationFP8GPUTest(TorchAoSerializationTest):
|
||||
@require_torchao_version_greater_or_equal("0.10.0")
|
||||
class TorchAoSerializationA8W4Test(TorchAoSerializationTest):
|
||||
EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
|
||||
device = "cuda:0"
|
||||
device = f"{torch_device}:0"
|
||||
|
||||
# called only once for all test in this class
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
if not torch.cuda.is_available() or torch.cuda.get_device_capability()[0] < 9:
|
||||
if not (get_device_properties()[0] == "cuda" and get_device_properties()[1] >= 9):
|
||||
raise unittest.SkipTest("CUDA compute capability 9.0 or higher required for FP8 tests")
|
||||
|
||||
from torchao.quantization import Int8DynamicActivationInt4WeightConfig
|
||||
|
||||
@@ -18,6 +18,7 @@ import unittest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, VptqConfig
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
@@ -74,7 +75,7 @@ class VptqTest(unittest.TestCase):
|
||||
|
||||
def tearDown(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
backend_empty_cache(torch_device)
|
||||
gc.collect()
|
||||
|
||||
def test_quantized_model(self):
|
||||
|
||||
Reference in New Issue
Block a user