From 038f8fc159355b6bae1a554d592df94cb61f2753 Mon Sep 17 00:00:00 2001 From: Yao Matrix Date: Wed, 7 May 2025 17:16:38 +0800 Subject: [PATCH] make aya vision 5 integration tests pass on xpu (#37990) * 5 aya vision integration pass on XPU Signed-off-by: Yao Matrix * fix style Signed-off-by: Yao Matrix --------- Signed-off-by: Yao Matrix Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> --- .../aya_vision/test_modeling_aya_vision.py | 52 +++++++++++++++---- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/tests/models/aya_vision/test_modeling_aya_vision.py b/tests/models/aya_vision/test_modeling_aya_vision.py index 5858321ee4..c35058abd6 100644 --- a/tests/models/aya_vision/test_modeling_aya_vision.py +++ b/tests/models/aya_vision/test_modeling_aya_vision.py @@ -25,10 +25,12 @@ from transformers import ( is_vision_available, ) from transformers.testing_utils import ( + Expectations, cleanup, + require_deterministic_for_xpu, require_read_token, require_torch, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -332,7 +334,7 @@ class AyaVisionIntegrationTest(unittest.TestCase): cleanup(torch_device, gc_collect=True) @slow - @require_torch_gpu + @require_torch_accelerator def test_small_model_integration_forward(self): processor = AutoProcessor.from_pretrained(self.model_checkpoint) model = AyaVisionForConditionalGeneration.from_pretrained( @@ -366,7 +368,8 @@ class AyaVisionIntegrationTest(unittest.TestCase): ) @slow - @require_torch_gpu + @require_torch_accelerator + @require_deterministic_for_xpu def test_small_model_integration_generate_text_only(self): processor = AutoProcessor.from_pretrained(self.model_checkpoint) model = AyaVisionForConditionalGeneration.from_pretrained( @@ -390,11 +393,19 @@ class AyaVisionIntegrationTest(unittest.TestCase): generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True ) print("decoded_output", decoded_output) - expected_output = "Whispers on the breeze,\nLeaves dance under moonlit skies,\nNature's quiet song." + + expected_outputs = Expectations( + { + ("xpu", 3): "Whispers on the breeze,\nLeaves dance under moonlit sky,\nNature's quiet song.", + ("cuda", 7): "Whispers on the breeze,\nLeaves dance under moonlit skies,\nNature's quiet song.", + } + ) # fmt: skip + expected_output = expected_outputs.get_expectation() + self.assertEqual(decoded_output, expected_output) @slow - @require_torch_gpu + @require_torch_accelerator def test_small_model_integration_generate_chat_template(self): processor = AutoProcessor.from_pretrained(self.model_checkpoint) model = AyaVisionForConditionalGeneration.from_pretrained( @@ -423,7 +434,7 @@ class AyaVisionIntegrationTest(unittest.TestCase): self.assertEqual(decoded_output, expected_output) @slow - @require_torch_gpu + @require_torch_accelerator def test_small_model_integration_batched_generate(self): processor = AutoProcessor.from_pretrained(self.model_checkpoint) model = AyaVisionForConditionalGeneration.from_pretrained( @@ -459,7 +470,14 @@ class AyaVisionIntegrationTest(unittest.TestCase): # Check first output decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True) print("decoded_output", decoded_output) - expected_output = "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene." # fmt: skip + expected_outputs = Expectations( + { + ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.", + ("cuda", 7): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.", + } + ) # fmt: skip + expected_output = expected_outputs.get_expectation() + self.assertEqual( decoded_output, expected_output, @@ -478,7 +496,8 @@ class AyaVisionIntegrationTest(unittest.TestCase): ) @slow - @require_torch_gpu + @require_torch_accelerator + @require_deterministic_for_xpu def test_small_model_integration_batched_generate_multi_image(self): processor = AutoProcessor.from_pretrained(self.model_checkpoint) model = AyaVisionForConditionalGeneration.from_pretrained( @@ -523,7 +542,14 @@ class AyaVisionIntegrationTest(unittest.TestCase): # Check first output decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True) # Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232 - expected_output = "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene." # fmt: skip + expected_outputs = Expectations( + { + ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.", + ("cuda", 7): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.", + } + ) # fmt: skip + expected_output = expected_outputs.get_expectation() + print("decoded_output", decoded_output) self.assertEqual( decoded_output, @@ -534,7 +560,13 @@ class AyaVisionIntegrationTest(unittest.TestCase): # Check second output decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True) print("decoded_output", decoded_output) - expected_output = "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at a" # fmt: skip + expected_outputs = Expectations( + { + ("xpu", 3): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ", + ("cuda", 7): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at a", + } + ) # fmt: skip + expected_output = expected_outputs.get_expectation() self.assertEqual( decoded_output, expected_output,