enable more test cases on xpu (#38572)

* enable glm4 integration cases on XPU, set xpu expectation for blip2

  Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* more

  Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* fix style

  Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* refine wording

  Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* refine test case names

  Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* run

  Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* add gemma2 and chameleon

  Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* fix review comments

  Signed-off-by: YAO Matrix <matrix.yao@intel.com>

---------

Signed-off-by: Matrix YAO <matrix.yao@intel.com>
Signed-off-by: YAO Matrix <matrix.yao@intel.com>
2025-06-06 15:29:51 +08:00
parent 31023b6909
commit 89542fb81c
23 changed files with 150 additions and 72 deletions
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -730,7 +730,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
        output_text = self.tokenizer.decode(output_parallel[0], skip_special_tokens=True)
        self.assertIn(output_text, self.EXPECTED_OUTPUTS)

-    def test_cpu_gpu_loading_random_device_map(self):
+    def test_cpu_accelerator_loading_random_device_map(self):
        r"""
        A test to check is dispatching a model on cpu & gpu works correctly using a random `device_map`.
        """
@@ -778,7 +778,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):

        self.check_inference_correctness(model_8bit)

-    def test_cpu_gpu_loading_custom_device_map(self):
+    def test_cpu_accelerator_loading_custom_device_map(self):
        r"""
        A test to check is dispatching a model on cpu & gpu works correctly using a custom `device_map`.
        This time the device map is more organized than the test above and uses the abstraction
@@ -805,7 +805,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):

        self.check_inference_correctness(model_8bit)

-    def test_cpu_gpu_disk_loading_custom_device_map(self):
+    def test_cpu_accelerator_disk_loading_custom_device_map(self):
        r"""
        A test to check is dispatching a model on cpu & gpu works correctly using a custom `device_map`.
        This time we also add `disk` on the device_map.
@@ -832,7 +832,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):

            self.check_inference_correctness(model_8bit)

-    def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self):
+    def test_cpu_accelerator_disk_loading_custom_device_map_kwargs(self):
        r"""
        A test to check is dispatching a model on cpu & gpu works correctly using a custom `device_map`.
        This time we also add `disk` on the device_map - using the kwargs directly instead of the quantization config
--- a/tests/quantization/ggml/test_ggml.py
+++ b/tests/quantization/ggml/test_ggml.py
@@ -20,7 +20,7 @@ from transformers import AddedToken, AutoModelForCausalLM, AutoModelForSeq2SeqLM
 from transformers.testing_utils import (
    require_gguf,
    require_read_token,
-    require_torch_gpu,
+    require_torch_accelerator,
    slow,
    torch_device,
 )
@@ -35,7 +35,7 @@ if is_gguf_available():


@require_gguf
-@require_torch_gpu
+@require_torch_accelerator
@slow
 class GgufQuantizationTests(unittest.TestCase):
    """
@@ -107,7 +107,7 @@ class GgufQuantizationTests(unittest.TestCase):


@require_gguf
-@require_torch_gpu
+@require_torch_accelerator
@slow
 class GgufIntegrationTests(unittest.TestCase):
    """
@@ -263,7 +263,7 @@ class GgufIntegrationTests(unittest.TestCase):


@require_gguf
-@require_torch_gpu
+@require_torch_accelerator
@slow
 class GgufModelTests(unittest.TestCase):
    mistral_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
--- a/tests/quantization/quark_integration/test_quark.py
+++ b/tests/quantization/quark_integration/test_quark.py
@@ -11,17 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import gc
 import unittest

 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, QuarkConfig
 from transformers.testing_utils import (
+    cleanup,
    is_torch_available,
    require_accelerate,
    require_quark,
    require_torch_gpu,
    require_torch_multi_gpu,
    slow,
+    torch_device,
 )
 from transformers.utils.import_utils import is_quark_available

@@ -79,11 +80,10 @@ class QuarkTest(unittest.TestCase):

    def tearDown(self):
        r"""
-        TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
+        TearDown function needs to be called at the end of each test to free the accelerator memory and cache, also to
        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
-        gc.collect()
-        torch.cuda.empty_cache()
+        cleanup(torch_device, gc_collect=True)

    def test_memory_footprint(self):
        mem_quantized = self.quantized_model.get_memory_footprint()