enable generation fsdp/utils cases on XPU (#38009)
* enable generation fsdp/utils test cases on XPU Signed-off-by: Yao Matrix <matrix.yao@intel.com> * fix style Signed-off-by: Yao Matrix <matrix.yao@intel.com> * xx Signed-off-by: Yao Matrix <matrix.yao@intel.com> * use backend_xx APIs Signed-off-by: Yao Matrix <matrix.yao@intel.com> * fix style Signed-off-by: Yao Matrix <matrix.yao@intel.com> --------- Signed-off-by: Yao Matrix <matrix.yao@intel.com>
This commit is contained in:
@@ -41,7 +41,6 @@ from transformers.testing_utils import (
|
||||
require_torch_gpu,
|
||||
require_torch_greater_or_equal,
|
||||
require_torch_multi_accelerator,
|
||||
require_torch_multi_gpu,
|
||||
require_torch_sdpa,
|
||||
set_config_for_less_flaky_test,
|
||||
set_model_for_less_flaky_test,
|
||||
@@ -2954,7 +2953,7 @@ class GenerationIntegrationTests(unittest.TestCase):
|
||||
def test_stop_sequence_stopping_criteria(self):
|
||||
prompt = """Hello I believe in"""
|
||||
generator = pipeline("text-generation", model="hf-internal-testing/tiny-random-bart")
|
||||
output = generator(prompt)
|
||||
output = generator(prompt, max_new_tokens=10)
|
||||
self.assertEqual(
|
||||
output,
|
||||
[{"generated_text": ("Hello I believe in we we we we we we we we we")}],
|
||||
@@ -3860,7 +3859,7 @@ class GenerationIntegrationTests(unittest.TestCase):
|
||||
|
||||
@slow
|
||||
@require_torch_multi_accelerator
|
||||
def test_assisted_decoding_in_different_gpu(self):
|
||||
def test_assisted_decoding_in_different_accelerator(self):
|
||||
device_0 = f"{torch_device}:0" if torch_device != "cpu" else "cpu"
|
||||
device_1 = f"{torch_device}:1" if torch_device != "cpu" else "cpu"
|
||||
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM").to(device_0)
|
||||
@@ -3885,7 +3884,7 @@ class GenerationIntegrationTests(unittest.TestCase):
|
||||
|
||||
@slow
|
||||
@require_torch_accelerator
|
||||
def test_assisted_decoding_model_in_gpu_assistant_in_cpu(self):
|
||||
def test_assisted_decoding_model_in_accelerator_assistant_in_cpu(self):
|
||||
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM").to(
|
||||
torch_device
|
||||
)
|
||||
@@ -3970,10 +3969,10 @@ class GenerationIntegrationTests(unittest.TestCase):
|
||||
self.assertTrue((expected_out == predicted_out).all().item())
|
||||
|
||||
@pytest.mark.generate
|
||||
@require_torch_multi_gpu
|
||||
def test_generate_with_static_cache_multi_gpu(self):
|
||||
@require_torch_multi_accelerator
|
||||
def test_generate_with_static_cache_multi_accelerator(self):
|
||||
"""
|
||||
Tests if the static cache has been set correctly and if generate works correctly when we are using multi-gpus.
|
||||
Tests if the static cache has been set correctly and if generate works correctly when we are using multi-accelerators.
|
||||
"""
|
||||
# need to split manually as auto doesn't work well with unbalanced model
|
||||
device_map = {"model.embed_tokens": 0, "model.layers.0": 0, "model.layers.1": 1, "model.norm": 1, "lm_head": 0}
|
||||
@@ -4005,10 +4004,10 @@ class GenerationIntegrationTests(unittest.TestCase):
|
||||
self.assertTrue(key_cache_1.device == value_cache_1.device == torch.device(1))
|
||||
|
||||
@pytest.mark.generate
|
||||
@require_torch_multi_gpu
|
||||
def test_generate_multi_gpu_causal_mask(self):
|
||||
@require_torch_multi_accelerator
|
||||
def test_generate_multi_accelerator_causal_mask(self):
|
||||
"""
|
||||
Tests that cache position device doesn't clash with causal mask device when we are using multi-gpus.
|
||||
Tests that cache position device doesn't clash with causal mask device when we are using multi-accelerators.
|
||||
In real life this happens only when the multimodal encoder size is big, so `embed_tokens` gets allocated to the next device.
|
||||
The error will be triggered whenever a batched input is used, so that `causal_mask` is actually prepared instead of
|
||||
being `None`.
|
||||
@@ -4033,10 +4032,10 @@ class GenerationIntegrationTests(unittest.TestCase):
|
||||
_ = model.generate(**inputs, max_new_tokens=20)
|
||||
|
||||
@pytest.mark.generate
|
||||
@require_torch_multi_gpu
|
||||
def test_init_static_cache_multi_gpu(self):
|
||||
@require_torch_multi_accelerator
|
||||
def test_init_static_cache_multi_accelerator(self):
|
||||
"""
|
||||
Tests if the static cache has been set correctly when we initialize it manually in a multi-gpu setup.
|
||||
Tests if the static cache has been set correctly when we initialize it manually in a multi-accelerator setup.
|
||||
"""
|
||||
# need to split manually as auto doesn't work well with unbalanced model
|
||||
device_map = {"model.embed_tokens": 0, "model.layers.0": 0, "model.layers.1": 1, "model.norm": 1, "lm_head": 0}
|
||||
@@ -4870,7 +4869,7 @@ class GenerationIntegrationTests(unittest.TestCase):
|
||||
|
||||
@require_read_token
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
@require_torch_accelerator
|
||||
def test_cache_device_map_with_vision_layer_device_map(self):
|
||||
"""
|
||||
Test that the cache device map is correctly set when the vision layer has a device map. Regression test for
|
||||
|
||||
Reference in New Issue
Block a user