From 27361bd218e5d1ecffc68b9317c49d3a764dfbba Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 17 Mar 2025 22:57:49 +0800 Subject: [PATCH] fix xpu tests (#36656) * fix awq xpu tests Signed-off-by: jiqing-feng * update Signed-off-by: jiqing-feng * fix llava next video bnb tests Signed-off-by: jiqing-feng --------- Signed-off-by: jiqing-feng Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/utils/quantization_config.py | 3 +++ .../test_modeling_llava_next_video.py | 10 +++++----- tests/quantization/autoawq/test_awq.py | 17 +++++++++++------ 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 0988d8ac14..851249b270 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -902,6 +902,9 @@ class AwqConfig(QuantizationConfigMixin): ) if self.backend == AwqBackendPackingMethod.LLMAWQ: + # Only cuda device can run this function + if not torch.cuda.is_available(): + raise ValueError("LLM-AWQ backend is only supported on CUDA") compute_capability = torch.cuda.get_device_capability() major, minor = compute_capability if major < 8: diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index 44c7deeffc..ba7323a075 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -430,13 +430,13 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase): # verify generation output = model.generate(**inputs, do_sample=False, max_new_tokens=40) - EXPECTED_DECODED_TEXT = 'USER: \nWhy is this video funny? ASSISTANT: The humor in this video comes from the unexpected and somewhat comical situation of a young child reading a book while another child is attempting to read the same book. The child who is reading the book seems' # fmt: skip - - self.assertEqual( - self.processor.decode(output[0], skip_special_tokens=True), - EXPECTED_DECODED_TEXT, + EXPECTED_DECODED_TEXT = ( + "USER: \nWhy is this video funny? ASSISTANT: The humor in this video comes from the unexpected and somewhat comical situation of a young child reading a book while another child is attempting to read the same book. The child who is reading the book seems", # cuda output + "USER: \nWhy is this video funny? ASSISTANT: The humor in this video comes from the unexpected and somewhat comical situation of a young child reading a book while wearing a pair of glasses that are too large for them. The glasses are", # xpu output ) + self.assertTrue(self.processor.decode(output[0], skip_special_tokens=True) in EXPECTED_DECODED_TEXT) + @slow @require_bitsandbytes def test_small_model_integration_test_batch(self): diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py index 5238c29a9c..d597f8de71 100644 --- a/tests/quantization/autoawq/test_awq.py +++ b/tests/quantization/autoawq/test_awq.py @@ -59,16 +59,21 @@ class AwqConfigTest(unittest.TestCase): with self.assertRaises(ValueError): AwqConfig(bits=4, backend="unexisting-backend") - compute_capability = torch.cuda.get_device_capability() - major, minor = compute_capability + # Only cuda device can run this function + support_llm_awq = False + if torch.cuda.is_available(): + compute_capability = torch.cuda.get_device_capability() + major, minor = compute_capability + if major >= 8: + support_llm_awq = True - if major < 8: + if support_llm_awq: + # LLMAWQ should work on an A100 + AwqConfig(bits=4, backend="llm-awq") + else: # LLMAWQ does not work on a T4 with self.assertRaises(ValueError): AwqConfig(bits=4, backend="llm-awq") - else: - # LLMAWQ should work on an A100 - AwqConfig(bits=4, backend="llm-awq") def test_to_dict(self): """