From 4dbf17c17f5834eb68f296457acc605a8c533b5a Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Mon, 24 Feb 2025 18:30:15 +0800 Subject: [PATCH] [tests] enable bnb tests on xpu (#36233) * fix failed test * fix device * fix more device cases * add more cases * fix empty cache * Update test_4bit.py --------- Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> --- tests/models/falcon/test_modeling_falcon.py | 4 +-- .../peft_integration/test_peft_integration.py | 5 ++-- tests/quantization/bnb/test_4bit.py | 25 +++++++++++-------- tests/quantization/bnb/test_mixed_int8.py | 14 ++++++----- 4 files changed, 27 insertions(+), 21 deletions(-) diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py index 2838df3807..a024b801e8 100644 --- a/tests/models/falcon/test_modeling_falcon.py +++ b/tests/models/falcon/test_modeling_falcon.py @@ -591,12 +591,12 @@ class FalconLanguageGenerationTest(unittest.TestCase): test_text = "A sequence: 1, 2" # should generate the rest of the sequence - unpadded_inputs = tokenizer([test_text], return_tensors="pt").to("cuda:0") + unpadded_inputs = tokenizer([test_text], return_tensors="pt").to(f"{torch_device}:0") unpadded_gen_out = model.generate(**unpadded_inputs, max_new_tokens=20) unpadded_gen_text = tokenizer.batch_decode(unpadded_gen_out, skip_special_tokens=True) dummy_text = "This is a longer text " * 2 # forces left-padding on `test_text` - padded_inputs = tokenizer([test_text, dummy_text], return_tensors="pt", padding=True).to("cuda:0") + padded_inputs = tokenizer([test_text, dummy_text], return_tensors="pt", padding=True).to(f"{torch_device}:0") padded_gen_out = model.generate(**padded_inputs, max_new_tokens=20) padded_gen_text = tokenizer.batch_decode(padded_gen_out, skip_special_tokens=True) diff --git a/tests/peft_integration/test_peft_integration.py b/tests/peft_integration/test_peft_integration.py index 61b60901ca..f48584d612 100644 --- a/tests/peft_integration/test_peft_integration.py +++ b/tests/peft_integration/test_peft_integration.py @@ -35,6 +35,7 @@ from transformers.testing_utils import ( require_bitsandbytes, require_peft, require_torch, + require_torch_accelerator, require_torch_gpu, slow, torch_device, @@ -440,7 +441,7 @@ class PeftIntegrationTester(unittest.TestCase, PeftTesterMixin): # dummy generation _ = peft_model.generate(input_ids=torch.LongTensor([[0, 1, 2, 3, 4, 5, 6, 7]]).to(torch_device)) - @require_torch_gpu + @require_torch_accelerator @require_bitsandbytes def test_peft_save_quantized(self): """ @@ -479,7 +480,7 @@ class PeftIntegrationTester(unittest.TestCase, PeftTesterMixin): self.assertTrue("pytorch_model.bin" not in os.listdir(tmpdirname)) self.assertTrue("model.safetensors" not in os.listdir(tmpdirname)) - @require_torch_gpu + @require_torch_accelerator @require_bitsandbytes def test_peft_save_quantized_regression(self): """ diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index f7e3c83829..ea4d87482b 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -32,6 +32,7 @@ from transformers import ( from transformers.models.opt.modeling_opt import OPTAttention from transformers.testing_utils import ( apply_skip_if_not_implemented, + backend_empty_cache, is_bitsandbytes_available, is_torch_available, require_accelerate, @@ -136,7 +137,7 @@ class Bnb4BitTest(Base4bitTest): del self.model_4bit gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_quantization_num_parameters(self): r""" @@ -224,7 +225,7 @@ class Bnb4BitTest(Base4bitTest): """ encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = self.model_4bit.generate( - input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(self.model_4bit.device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -242,7 +243,7 @@ class Bnb4BitTest(Base4bitTest): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = model_4bit_from_config.generate( - input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(model_4bit_from_config.device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -261,7 +262,7 @@ class Bnb4BitTest(Base4bitTest): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = model_4bit.generate( - input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(model_4bit.device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -277,10 +278,10 @@ class Bnb4BitTest(Base4bitTest): self.assertEqual(self.model_4bit.device.type, "cpu") self.assertAlmostEqual(self.model_4bit.get_memory_footprint(), mem_before) - if torch.cuda.is_available(): + if torch_device in ["cuda", "xpu"]: # Move back to CUDA device - self.model_4bit.to("cuda") - self.assertEqual(self.model_4bit.device.type, "cuda") + self.model_4bit.to(torch_device) + self.assertEqual(self.model_4bit.device.type, torch_device) self.assertAlmostEqual(self.model_4bit.get_memory_footprint(), mem_before) def test_device_and_dtype_assignment(self): @@ -323,11 +324,13 @@ class Bnb4BitTest(Base4bitTest): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") self.model_fp16 = self.model_fp16.to(torch.float32) - _ = self.model_fp16.generate(input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10) + _ = self.model_fp16.generate( + input_ids=encoded_input["input_ids"].to(self.model_fp16.device), max_new_tokens=10 + ) - if torch.cuda.is_available(): + if torch_device in ["cuda", "xpu"]: # Check that this does not throw an error - _ = self.model_fp16.cuda() + _ = self.model_fp16.to(torch_device) # Check this does not throw an error _ = self.model_fp16.to("cpu") @@ -617,7 +620,7 @@ class BaseSerializationTest(unittest.TestCase): def tearDown(self): gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_serialization(self, quant_type="nf4", double_quant=True, safe_serialization=True): r""" diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index e73dd82f34..634a2eb16b 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -274,7 +274,7 @@ class MixedInt8Test(BaseMixedInt8Test): """ encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = self.model_8bit.generate( - input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(self.model_8bit.device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -292,7 +292,7 @@ class MixedInt8Test(BaseMixedInt8Test): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = model_8bit_from_config.generate( - input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(model_8bit_from_config.device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -311,7 +311,7 @@ class MixedInt8Test(BaseMixedInt8Test): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = model_8bit.generate( - input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(model_8bit.device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -362,7 +362,9 @@ class MixedInt8Test(BaseMixedInt8Test): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") self.model_fp16 = self.model_fp16.to(torch.float32) - _ = self.model_fp16.generate(input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10) + _ = self.model_fp16.generate( + input_ids=encoded_input["input_ids"].to(self.model_fp16.device), max_new_tokens=10 + ) # Check this does not throw an error _ = self.model_fp16.to("cpu") @@ -402,7 +404,7 @@ class MixedInt8Test(BaseMixedInt8Test): # generate encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = model_from_saved.generate( - input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(model_from_saved.device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -429,7 +431,7 @@ class MixedInt8Test(BaseMixedInt8Test): # generate encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = model_from_saved.generate( - input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(model_from_saved.device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)