Fix Gemma3IntegrationTest (#38471)

* check

* check

* check

* check

* check

* check

* check

* test style bot

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Yih-Dar
2025-05-29 16:51:12 +02:00
committed by GitHub
parent e508965df7
commit 81cff7ad34

View File

@@ -28,11 +28,14 @@ from transformers import (
is_torch_available, is_torch_available,
) )
from transformers.testing_utils import ( from transformers.testing_utils import (
Expectations,
cleanup, cleanup,
is_flash_attn_2_available,
require_flash_attn, require_flash_attn,
require_read_token, require_read_token,
require_torch, require_torch,
require_torch_gpu, require_torch_accelerator,
require_torch_large_accelerator,
slow, slow,
torch_device, torch_device,
) )
@@ -368,7 +371,7 @@ class Gemma3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unitte
@slow @slow
@require_torch_gpu @require_torch_accelerator
@require_read_token @require_read_token
class Gemma3IntegrationTest(unittest.TestCase): class Gemma3IntegrationTest(unittest.TestCase):
def setUp(self): def setUp(self):
@@ -407,9 +410,16 @@ class Gemma3IntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=30, do_sample=False) output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
output_text = self.processor.batch_decode(output, skip_special_tokens=True) output_text = self.processor.batch_decode(output, skip_special_tokens=True)
EXPECTED_TEXTS = ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like'] # fmt: skip EXPECTED_TEXTS = Expectations(
self.assertEqual(output_text, EXPECTED_TEXTS) {
("cuda", 7): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach with turquoise water in the background. It looks like a lovely,'],
("cuda", 8): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. It looks like a very sunny and'],
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
self.assertEqual(output_text, EXPECTED_TEXT)
@require_torch_large_accelerator
def test_model_4b_batch(self): def test_model_4b_batch(self):
model_id = "google/gemma-3-4b-it" model_id = "google/gemma-3-4b-it"
@@ -444,12 +454,20 @@ class Gemma3IntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=30, do_sample=False) output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
output_text = self.processor.batch_decode(output, skip_special_tokens=True) output_text = self.processor.batch_decode(output, skip_special_tokens=True)
EXPECTED_TEXTS = [ EXPECTED_TEXTS = Expectations(
'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like', {
"user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a cow" ("cuda", 7): [],
] # fmt: skip ("cuda", 8):
self.assertEqual(output_text, EXPECTED_TEXTS) [
'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. It looks like a very sunny and',
'user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. They depict very different scenes:\n\n* **Image 1** shows a cow standing on a beach.',
]
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
self.assertEqual(output_text, EXPECTED_TEXT)
@require_torch_large_accelerator
def test_model_4b_crops(self): def test_model_4b_crops(self):
model_id = "google/gemma-3-4b-it" model_id = "google/gemma-3-4b-it"
@@ -479,10 +497,17 @@ class Gemma3IntegrationTest(unittest.TestCase):
output_text = self.processor.batch_decode(output, skip_special_tokens=True) output_text = self.processor.batch_decode(output, skip_special_tokens=True)
EXPECTED_NUM_IMAGES = 3 # one for the original image and two crops of it EXPECTED_NUM_IMAGES = 3 # one for the original image and two crops of it
EXPECTED_TEXTS = ['user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a beach with a turquoise ocean and blue sky in the background. It looks like the cow is enjoying the beach'] # fmt: skip EXPECTED_TEXTS = Expectations(
{
("cuda", 7): [],
("cuda", 8): ['user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.']
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES) self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES)
self.assertEqual(output_text, EXPECTED_TEXTS) self.assertEqual(output_text, EXPECTED_TEXT)
@require_torch_large_accelerator
def test_model_4b_batch_crops(self): def test_model_4b_batch_crops(self):
model_id = "google/gemma-3-4b-it" model_id = "google/gemma-3-4b-it"
@@ -525,13 +550,20 @@ class Gemma3IntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=30, do_sample=False) output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
output_text = self.processor.batch_decode(output, skip_special_tokens=True) output_text = self.processor.batch_decode(output, skip_special_tokens=True)
EXPECTED_NUM_IMAGES = 9 # 3 * (one for the original image and two crops of it) = 9 EXPECTED_NUM_IMAGES = 9 # 3 * (one for the original image and two crops of it) = 9
EXPECTED_TEXTS = [ EXPECTED_TEXTS = Expectations(
"user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a beach with a turquoise ocean and blue sky in the background. It looks like the cow is enjoying the beach", {
"user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nWhile they all feature a brown cow in the foreground and a similar background (including the stop signs and", ("cuda", 7): [],
] # fmt: skip ("cuda", 8): [
'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.',
'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a',
]
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES) self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES)
self.assertEqual(output_text, EXPECTED_TEXTS) self.assertEqual(output_text, EXPECTED_TEXT)
@require_torch_large_accelerator
def test_model_4b_multiimage(self): def test_model_4b_multiimage(self):
model_id = "google/gemma-3-4b-it" model_id = "google/gemma-3-4b-it"
@@ -561,9 +593,14 @@ class Gemma3IntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=30, do_sample=False) output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
output_text = self.processor.batch_decode(output, skip_special_tokens=True) output_text = self.processor.batch_decode(output, skip_special_tokens=True)
EXPECTED_TEXTS = Expectations(
EXPECTED_TEXTS = ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt looks like a street scene in a vibrant,"] # fmt: skip {
self.assertEqual(output_text, EXPECTED_TEXTS) ("cuda", 7): [],
("cuda", 8): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Main Features:**\n\n* **Chinese Archway:** The most prominent"]
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
self.assertEqual(output_text, EXPECTED_TEXT)
def test_model_1b_text_only(self): def test_model_1b_text_only(self):
model_id = "google/gemma-3-1b-it" model_id = "google/gemma-3-1b-it"
@@ -577,12 +614,18 @@ class Gemma3IntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=30, do_sample=False) output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True) output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
EXPECTED_TEXTS = ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a river deep,\nWith patterns hidden, secrets sleep.\nA neural net, a watchful eye,\nLearning'] # fmt: skip EXPECTED_TEXTS = Expectations(
self.assertEqual(output_text, EXPECTED_TEXTS) {
("cuda", 7): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a silent stream,\nInto the neural net, a waking dream.\nAlgorithms hum, a coded grace,\n'],
("cuda", 8): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a silent stream,\nInto the neural net, a waking dream.\nAlgorithms hum, a coded grace,\n'],
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
self.assertEqual(output_text, EXPECTED_TEXT)
# TODO: raushan FA2 generates gibberish for no reason, check later # TODO: raushan FA2 generates gibberish for no reason, check later
@require_flash_attn @require_flash_attn
@require_torch_gpu @require_torch_large_accelerator
@pytest.mark.flash_attn_test @pytest.mark.flash_attn_test
def test_model_4b_flash_attn(self): def test_model_4b_flash_attn(self):
model_id = "google/gemma-3-4b-it" model_id = "google/gemma-3-4b-it"
@@ -602,8 +645,14 @@ class Gemma3IntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=30, do_sample=False) output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
output_text = self.processor.batch_decode(output, skip_special_tokens=True) output_text = self.processor.batch_decode(output, skip_special_tokens=True)
EXPECTED_TEXTS = ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. It looks like a very sunny and'] # fmt: skip EXPECTED_TEXTS = Expectations(
self.assertEqual(output_text, EXPECTED_TEXTS) {
("cuda", 7): [],
("cuda", 8): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach with turquoise water and a distant island in the background. It looks like a sunny day'],
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
self.assertEqual(output_text, EXPECTED_TEXT)
@parameterized.expand([("flash_attention_2",), ("sdpa",), ("eager",)]) @parameterized.expand([("flash_attention_2",), ("sdpa",), ("eager",)])
def test_generation_beyond_sliding_window(self, attn_implementation: str): def test_generation_beyond_sliding_window(self, attn_implementation: str):
@@ -613,6 +662,9 @@ class Gemma3IntegrationTest(unittest.TestCase):
""" """
model_id = "google/gemma-3-1b-it" model_id = "google/gemma-3-1b-it"
if attn_implementation == "flash_attention_2" and not is_flash_attn_2_available():
self.skipTest("FlashAttention2 is required for this test.")
input_text = [ input_text = [
"This is a nice place. " * 800 + "I really enjoy the scenery,", # This is larger than 4096 tokens "This is a nice place. " * 800 + "I really enjoy the scenery,", # This is larger than 4096 tokens
"A list of colors: red, blue", # This will almost all be padding tokens "A list of colors: red, blue", # This will almost all be padding tokens