enable more test cases on xpu (#38572)
* enable glm4 integration cases on XPU, set xpu expectation for blip2 Signed-off-by: Matrix YAO <matrix.yao@intel.com> * more Signed-off-by: YAO Matrix <matrix.yao@intel.com> * fix style Signed-off-by: YAO Matrix <matrix.yao@intel.com> * refine wording Signed-off-by: YAO Matrix <matrix.yao@intel.com> * refine test case names Signed-off-by: YAO Matrix <matrix.yao@intel.com> * run Signed-off-by: YAO Matrix <matrix.yao@intel.com> * add gemma2 and chameleon Signed-off-by: YAO Matrix <matrix.yao@intel.com> * fix review comments Signed-off-by: YAO Matrix <matrix.yao@intel.com> --------- Signed-off-by: Matrix YAO <matrix.yao@intel.com> Signed-off-by: YAO Matrix <matrix.yao@intel.com>
This commit is contained in:
@@ -730,7 +730,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
|
||||
output_text = self.tokenizer.decode(output_parallel[0], skip_special_tokens=True)
|
||||
self.assertIn(output_text, self.EXPECTED_OUTPUTS)
|
||||
|
||||
def test_cpu_gpu_loading_random_device_map(self):
|
||||
def test_cpu_accelerator_loading_random_device_map(self):
|
||||
r"""
|
||||
A test to check if dispatching a model on cpu & gpu works correctly using a random `device_map`.
|
||||
"""
|
||||
@@ -778,7 +778,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
|
||||
|
||||
self.check_inference_correctness(model_8bit)
|
||||
|
||||
def test_cpu_gpu_loading_custom_device_map(self):
|
||||
def test_cpu_accelerator_loading_custom_device_map(self):
|
||||
r"""
|
||||
A test to check if dispatching a model on cpu & gpu works correctly using a custom `device_map`.
|
||||
This time the device map is more organized than the test above and uses the abstraction
|
||||
@@ -805,7 +805,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
|
||||
|
||||
self.check_inference_correctness(model_8bit)
|
||||
|
||||
def test_cpu_gpu_disk_loading_custom_device_map(self):
|
||||
def test_cpu_accelerator_disk_loading_custom_device_map(self):
|
||||
r"""
|
||||
A test to check if dispatching a model on cpu & gpu works correctly using a custom `device_map`.
|
||||
This time we also add `disk` on the device_map.
|
||||
@@ -832,7 +832,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
|
||||
|
||||
self.check_inference_correctness(model_8bit)
|
||||
|
||||
def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self):
|
||||
def test_cpu_accelerator_disk_loading_custom_device_map_kwargs(self):
|
||||
r"""
|
||||
A test to check if dispatching a model on cpu & gpu works correctly using a custom `device_map`.
|
||||
This time we also add `disk` on the device_map - using the kwargs directly instead of the quantization config
|
||||
|
||||
@@ -20,7 +20,7 @@ from transformers import AddedToken, AutoModelForCausalLM, AutoModelForSeq2SeqLM
|
||||
from transformers.testing_utils import (
|
||||
require_gguf,
|
||||
require_read_token,
|
||||
require_torch_gpu,
|
||||
require_torch_accelerator,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
@@ -35,7 +35,7 @@ if is_gguf_available():
|
||||
|
||||
|
||||
@require_gguf
|
||||
@require_torch_gpu
|
||||
@require_torch_accelerator
|
||||
@slow
|
||||
class GgufQuantizationTests(unittest.TestCase):
|
||||
"""
|
||||
@@ -107,7 +107,7 @@ class GgufQuantizationTests(unittest.TestCase):
|
||||
|
||||
|
||||
@require_gguf
|
||||
@require_torch_gpu
|
||||
@require_torch_accelerator
|
||||
@slow
|
||||
class GgufIntegrationTests(unittest.TestCase):
|
||||
"""
|
||||
@@ -263,7 +263,7 @@ class GgufIntegrationTests(unittest.TestCase):
|
||||
|
||||
|
||||
@require_gguf
|
||||
@require_torch_gpu
|
||||
@require_torch_accelerator
|
||||
@slow
|
||||
class GgufModelTests(unittest.TestCase):
|
||||
mistral_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
|
||||
|
||||
@@ -11,17 +11,18 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import gc
|
||||
import unittest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, QuarkConfig
|
||||
from transformers.testing_utils import (
|
||||
cleanup,
|
||||
is_torch_available,
|
||||
require_accelerate,
|
||||
require_quark,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils.import_utils import is_quark_available
|
||||
|
||||
@@ -79,11 +80,10 @@ class QuarkTest(unittest.TestCase):
|
||||
|
||||
def tearDown(self):
|
||||
r"""
|
||||
TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
|
||||
TearDown function needs to be called at the end of each test to free the accelerator memory and cache, also to
|
||||
avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
|
||||
"""
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
cleanup(torch_device, gc_collect=True)
|
||||
|
||||
def test_memory_footprint(self):
|
||||
mem_quantized = self.quantized_model.get_memory_footprint()
|
||||
|
||||
Reference in New Issue
Block a user