Fix CI for VLMs (#35690)

* fix some easy tests

* more tests

* remove logit check here also

* add require_torch_large_gpu in Emu3
This commit is contained in:
Raushan Turganbay
2025-01-20 11:15:39 +01:00
committed by GitHub
parent 5fa3534475
commit 8571bb145a
17 changed files with 102 additions and 485 deletions

View File

@@ -17,6 +17,7 @@
import unittest
import numpy as np
import pytest
import requests
from huggingface_hub import hf_hub_download
from parameterized import parameterized
@@ -25,6 +26,7 @@ from transformers import Emu3Config, Emu3TextConfig, is_torch_available, is_visi
from transformers.testing_utils import (
require_bitsandbytes,
require_torch,
require_torch_large_gpu,
slow,
torch_device,
)
@@ -394,48 +396,44 @@ class Emu3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
def test_initialization(self):
pass
@pytest.mark.generate
@unittest.skip("Emu3 has dynamic control flow in vision backbone")
def test_generate_with_static_cache(self):
    """Intentionally empty override; the test is skipped.

    The skip reason on the decorator is the authoritative explanation:
    the Emu3 vision backbone uses dynamic control flow.
    """
@require_torch
class Emu3IntegrationTest(unittest.TestCase):
@slow
@require_bitsandbytes
def test_model_generation(self):
    """Check greedy generation for one image + prompt against a pinned completion.

    The checkpoint is loaded with ``load_in_4bit=True`` and decoding runs with
    ``do_sample=False`` for 40 new tokens; the decoded batch must equal
    ``EXPECTED_TEXT_COMPLETION`` exactly.
    """
    # Each of model / processor / image / expected text is bound exactly once.
    # Earlier bindings that were immediately overwritten (and therefore only
    # cost an extra checkpoint load and image download) are dropped.
    model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", load_in_4bit=True)
    processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

    image = Image.open(requests.get("https://picsum.photos/id/237/200/200", stream=True).raw)
    prompt = "USER: <image>Describe what do you see here and tell me about the history behind it? ASSISTANT:"

    # Move the processed inputs to the model's device, casting to float16.
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, torch.float16)

    # greedy generation outputs
    EXPECTED_TEXT_COMPLETION = ['USER: 64*64Describe what do you see here and tell me about the history behind it? ASSISTANT: The image captures a moment of tranquility with a black Labrador Retriever resting on a wooden floor. The dog, with its glossy black coat, is lying down with its front legs stretched out in'] # fmt: skip
    generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)
    self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
@slow
@require_bitsandbytes
@require_torch_large_gpu
def test_model_generation_batched(self):
model = Emu3ForConditionalGeneration.from_pretrained(
"Emu3-community/Emu3-Chat-hf", load_in_4bit=True, device_map="auto"
)
processor = Emu3Processor.from_pretrained("Emu3-community/Emu3-Chat-hf")
model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", load_in_4bit=True)
processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")
processor.tokenizer.padding_side = "left"
image = Image.open(
requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw
)
image_2 = Image.open(
requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw
)
image = Image.open(requests.get("https://picsum.photos/id/237/50/50", stream=True).raw)
image_2 = Image.open(requests.get("https://picsum.photos/id/247/50/50", stream=True).raw)
prompts = [
"USER: <image>Describe what do you see here and tell me about the history behind it? ASSISTANT:",
"USER: <image>What do you know about the constellation in this image? ASSISTANT:",
"USER: <image>Describe what do you see here? ASSISTANT:",
"USER: <image>What can you say about the image? ASSISTANT:",
]
inputs = processor(images=[image, image_2], text=prompts, padding=True, return_tensors="pt").to(
@@ -444,52 +442,47 @@ class Emu3IntegrationTest(unittest.TestCase):
# greedy generation outputs
EXPECTED_TEXT_COMPLETION = [
'USER: 114*143Describe what do you see here and tell me about the history behind it? ASSISTANT: The image depicts the constellation of Ursa Minor, also known as the Little Bear. This constellation was one of the 24 modern constellations introduced by Charles Messier in 178',
'USER: 75*125What do you know about the constellation in this image? ASSISTANT: The image shows a segment of a wire rope, characterized by its consistent pattern and regular twists, indicative of a high-quality, well-made rope. This type of detail suggests careful manufacturing processes and attention to'
] # fmt: skip
"USER: 64*64Describe what do you see here? ASSISTANT: The image depicts a black panther in a crouched position. The panther's body is elongated and curved, with its head lowered and ears pointed forward, suggesting alertness or focus.",
'USER: 64*64What can you say about the image? ASSISTANT: The image depicts a serene natural landscape. The foreground consists of a grassy area with some patches of bare earth. The middle ground shows a steep, reddish-brown cliff, which could be a'
] # fmt: skip
generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
text = processor.batch_decode(generated_ids, skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
@slow
@require_bitsandbytes
@require_torch_large_gpu
def test_model_generation_multi_image(self):
    """Check greedy generation for a single prompt that references two images.

    The checkpoint is loaded with ``load_in_4bit=True`` and decoding runs with
    ``do_sample=False`` for 40 new tokens; the decoded batch must equal
    ``EXPECTED_TEXT_COMPLETION`` exactly.
    """
    # Each name is bound exactly once. Earlier bindings that were immediately
    # overwritten (and therefore only cost an extra checkpoint load and two
    # extra image downloads) are dropped.
    model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", load_in_4bit=True)
    processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

    image = Image.open(requests.get("https://picsum.photos/id/237/50/50", stream=True).raw)
    image_2 = Image.open(requests.get("https://picsum.photos/id/247/50/50", stream=True).raw)
    # Two <image> placeholders in the prompt, two images passed in the same order.
    prompt = "USER: <image><image>What do these two images have in common? ASSISTANT:"

    # Move the processed inputs to the model's device, casting to float16.
    inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.float16)

    # greedy generation outputs
    EXPECTED_TEXT_COMPLETION = ["USER: 64*6464*64What do these two images have in common? ASSISTANT: Both images feature a black animal, but they are not the same animal. The top image shows a close-up of a black cow's head, while the bottom image depicts a black cow in a natural"] # fmt: skip
    generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)
    self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
@slow
@require_bitsandbytes
@require_torch_large_gpu
def test_model_generate_images(self):
model = Emu3ForConditionalGeneration.from_pretrained(
"Emu3-community/Emu3-Gen-hf", load_in_4bit=True, device_map="auto"
)
processor = Emu3Processor.from_pretrained("Emu3-community/Emu3-Chat-hf")
model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Gen-hf", load_in_4bit=True)
processor = Emu3Processor.from_pretrained("BAAI/Emu3-Gen-hf")
inputs = processor(
text=["a portrait of young girl. masterpiece, film grained, best quality."],
padding=True,
return_tensors="pt",
return_for_image_generation=True,
image_area=1600,
).to(model.device)
self.assertTrue(inputs.input_ids.shape[1] == 23)
self.assertTrue(inputs.input_ids.shape[1] == 21)
image_sizes = inputs.pop("image_sizes")
HEIGHT, WIDTH = image_sizes[0]
@@ -522,20 +515,20 @@ class Emu3IntegrationTest(unittest.TestCase):
out = model.generate(
**inputs,
max_new_tokens=50_000,
max_new_tokens=200,
prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
do_sample=False,
)
self.assertTrue(out.shape[1] == 8216)
self.assertTrue(out.shape[1] == 54)
image = model.decode_image_tokens(out[:, inputs.input_ids.shape[1] :], height=HEIGHT, width=WIDTH)
images = processor.postprocess(list(image.float()), return_tensors="np")
self.assertTrue(images["pixel_values"].shape == (3, 720, 720))
self.assertTrue(images["pixel_values"].shape == (3, 40, 40))
self.assertTrue(isinstance(images["pixel_values"], np.ndarray))
filepath = hf_hub_download(
repo_id="raushan-testing-hf/images_test",
filename="emu3_generated_pixels.npy",
filename="emu3_image.npy",
repo_type="dataset",
)
original_pixels = np.load(filepath)