enable more test cases on xpu (#38572)

* enable glm4 integration cases on XPU, set xpu expectation for blip2

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* more

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* fix style

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* refine wording

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* refine test case names

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* run

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* add gemma2 and chameleon

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* fix review comments

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

---------

Signed-off-by: Matrix YAO <matrix.yao@intel.com>
Signed-off-by: YAO Matrix <matrix.yao@intel.com>
This commit is contained in:
Yao Matrix
2025-06-06 15:29:51 +08:00
committed by GitHub
parent 31023b6909
commit 89542fb81c
23 changed files with 150 additions and 72 deletions

View File

@@ -730,7 +730,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
output_text = self.tokenizer.decode(output_parallel[0], skip_special_tokens=True)
self.assertIn(output_text, self.EXPECTED_OUTPUTS)
def test_cpu_gpu_loading_random_device_map(self):
def test_cpu_accelerator_loading_random_device_map(self):
r"""
A test to check if dispatching a model on cpu & gpu works correctly using a random `device_map`.
"""
@@ -778,7 +778,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
self.check_inference_correctness(model_8bit)
def test_cpu_gpu_loading_custom_device_map(self):
def test_cpu_accelerator_loading_custom_device_map(self):
r"""
A test to check if dispatching a model on cpu & gpu works correctly using a custom `device_map`.
This time the device map is more organized than the test above and uses the abstraction
@@ -805,7 +805,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
self.check_inference_correctness(model_8bit)
def test_cpu_gpu_disk_loading_custom_device_map(self):
def test_cpu_accelerator_disk_loading_custom_device_map(self):
r"""
A test to check if dispatching a model on cpu & gpu works correctly using a custom `device_map`.
This time we also add `disk` on the device_map.
@@ -832,7 +832,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
self.check_inference_correctness(model_8bit)
def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self):
def test_cpu_accelerator_disk_loading_custom_device_map_kwargs(self):
r"""
A test to check if dispatching a model on cpu & gpu works correctly using a custom `device_map`.
This time we also add `disk` on the device_map - using the kwargs directly instead of the quantization config

View File

@@ -20,7 +20,7 @@ from transformers import AddedToken, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from transformers.testing_utils import (
require_gguf,
require_read_token,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -35,7 +35,7 @@ if is_gguf_available():
@require_gguf
@require_torch_gpu
@require_torch_accelerator
@slow
class GgufQuantizationTests(unittest.TestCase):
"""
@@ -107,7 +107,7 @@ class GgufQuantizationTests(unittest.TestCase):
@require_gguf
@require_torch_gpu
@require_torch_accelerator
@slow
class GgufIntegrationTests(unittest.TestCase):
"""
@@ -263,7 +263,7 @@ class GgufIntegrationTests(unittest.TestCase):
@require_gguf
@require_torch_gpu
@require_torch_accelerator
@slow
class GgufModelTests(unittest.TestCase):
mistral_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"

View File

@@ -11,17 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, QuarkConfig
from transformers.testing_utils import (
cleanup,
is_torch_available,
require_accelerate,
require_quark,
require_torch_gpu,
require_torch_multi_gpu,
slow,
torch_device,
)
from transformers.utils.import_utils import is_quark_available
@@ -79,11 +80,10 @@ class QuarkTest(unittest.TestCase):
def tearDown(self):
r"""
TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
TearDown function needs to be called at the end of each test to free the accelerator memory and cache, also to
avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
"""
gc.collect()
torch.cuda.empty_cache()
cleanup(torch_device, gc_collect=True)
def test_memory_footprint(self):
mem_quantized = self.quantized_model.get_memory_footprint()