VLM: fixes after refactor (#32907)
* leave only half of the changes * fix tests * [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava * fix tests, first try * [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava * fix, second try * [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava * fix * [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava
This commit is contained in:
committed by
GitHub
parent
f24f084329
commit
7d2d6ce9cb
@@ -302,7 +302,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
|
||||
|
||||
output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
|
||||
EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Lastly, be respectful of the environment and other visitors, as the pier is a shared space where people can enjoy the view, relax, or engage in recreational activities." # fmt: skip
|
||||
EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." # fmt: skip
|
||||
|
||||
self.assertEqual(
|
||||
processor.decode(output[0], skip_special_tokens=True),
|
||||
@@ -353,7 +353,10 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
|
||||
output = model.generate(**inputs, max_new_tokens=20)
|
||||
|
||||
EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring along', 'USER: \nWhat is this?\nASSISTANT: Cats'] # fmt: skip
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
'USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring.',
|
||||
'USER: \nWhat is this?\nASSISTANT: Cats'
|
||||
] # fmt: skip
|
||||
self.assertEqual(
|
||||
self.processor.batch_decode(output, skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
@@ -393,7 +396,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_batched_generation(self):
|
||||
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf").to(torch_device)
|
||||
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True)
|
||||
|
||||
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
|
||||
|
||||
@@ -415,9 +418,9 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
model = model.eval()
|
||||
|
||||
EXPECTED_OUTPUT = [
|
||||
"\n \nUSER: What's the the difference of two images?\nASSISTANT: In the two images, the primary difference is the presence of a small dog in one and a ll",
|
||||
"\nUSER: Describe the image.\nASSISTANT: The image features a small, fluffy dog sitting on a sidewalk. The dog is holding",
|
||||
"\nUSER: Describe the image.\nASSISTANT: The image features a lone, adult llama standing on a grassy hill. The llama",
|
||||
"\n \nUSER: What's the the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
|
||||
"\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small",
|
||||
"\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. The llama is the",
|
||||
]
|
||||
|
||||
generate_ids = model.generate(**inputs, max_new_tokens=20)
|
||||
@@ -451,26 +454,23 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
def test_llava_merge_inputs_error_bug(self):
|
||||
# This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
|
||||
model_id = "llava-hf/llava-1.5-7b-hf"
|
||||
model = LlavaForConditionalGeneration.from_pretrained(
|
||||
model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True
|
||||
).to(torch_device)
|
||||
model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
||||
|
||||
# Simulate some user inputs
|
||||
pixel_values = torch.randn(
|
||||
(2, 3, 336, 336),
|
||||
(1, 3, 336, 336),
|
||||
dtype=torch.float,
|
||||
device=torch_device,
|
||||
)
|
||||
input_ids = torch.tensor(
|
||||
[
|
||||
[32001, 32001, 1, 15043, 7084, 32000, 29871, 13, 7900],
|
||||
[1, 15043, 7084, 29901, 29871, 32000, 29871, 13, 7900],
|
||||
],
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
attention_mask = torch.tensor(
|
||||
[[0, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]],
|
||||
[[0, 0, 1, 1, 1, 1, 1, 1, 1]],
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
@@ -515,6 +515,31 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
# Make sure that `generate` works
|
||||
_ = model.generate(**inputs, max_new_tokens=20)
|
||||
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_generation_siglip_backbone(self):
|
||||
model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
|
||||
model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype="float16", device_map=torch_device)
|
||||
processor = AutoProcessor.from_pretrained(model_id)
|
||||
|
||||
# check processing with expansion of inputs (w/o expansion should work with any backbone)
|
||||
processor.vision_feature_select_strategy = "default"
|
||||
processor.patch_size = 14
|
||||
|
||||
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
raw_image = Image.open(requests.get(image_file, stream=True).raw)
|
||||
inputs = processor(
|
||||
text="<|im_start|>user\n<image>\nWhat are these?<|im_end|>\n<|im_start|>assistant",
|
||||
images=raw_image,
|
||||
return_tensors="pt",
|
||||
).to(torch_device, torch.float16)
|
||||
|
||||
# Make sure that `generate` works
|
||||
output = model.generate(**inputs, max_new_tokens=30)
|
||||
|
||||
EXPECTED_DECODED_TEXT = "user\n\nWhat are these?\nassistant The image shows two cats, one on the left and one on the right. They appear to be resting or sleeping on a pink blanket. The cat"
|
||||
self.assertTrue(processor.batch_decode(output, skip_special_tokens=True)[0] == EXPECTED_DECODED_TEXT)
|
||||
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_expansion_in_processing(self):
|
||||
|
||||
Reference in New Issue
Block a user