VLM: fixes after refactor (#32907)
* leave only half of the changes
* fix tests
* [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava
* fix tests, first try
* [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava
* fix, second try
* [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava
* fix
* [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava
This commit is contained in:
committed by
GitHub
parent
f24f084329
commit
7d2d6ce9cb
@@ -302,7 +302,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
|
||||
|
||||
output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
|
||||
EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Lastly, be respectful of the environment and other visitors, as the pier is a shared space where people can enjoy the view, relax, or engage in recreational activities." # fmt: skip
|
||||
EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." # fmt: skip
|
||||
|
||||
self.assertEqual(
|
||||
processor.decode(output[0], skip_special_tokens=True),
|
||||
@@ -353,7 +353,10 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
|
||||
output = model.generate(**inputs, max_new_tokens=20)
|
||||
|
||||
EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring along', 'USER: \nWhat is this?\nASSISTANT: Cats'] # fmt: skip
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
'USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring.',
|
||||
'USER: \nWhat is this?\nASSISTANT: Cats'
|
||||
] # fmt: skip
|
||||
self.assertEqual(
|
||||
self.processor.batch_decode(output, skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
@@ -393,7 +396,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_batched_generation(self):
|
||||
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf").to(torch_device)
|
||||
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True)
|
||||
|
||||
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
|
||||
|
||||
@@ -415,9 +418,9 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
model = model.eval()
|
||||
|
||||
EXPECTED_OUTPUT = [
|
||||
"\n \nUSER: What's the the difference of two images?\nASSISTANT: In the two images, the primary difference is the presence of a small dog in one and a ll",
|
||||
"\nUSER: Describe the image.\nASSISTANT: The image features a small, fluffy dog sitting on a sidewalk. The dog is holding",
|
||||
"\nUSER: Describe the image.\nASSISTANT: The image features a lone, adult llama standing on a grassy hill. The llama",
|
||||
"\n \nUSER: What's the the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
|
||||
"\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small",
|
||||
"\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. The llama is the",
|
||||
]
|
||||
|
||||
generate_ids = model.generate(**inputs, max_new_tokens=20)
|
||||
@@ -451,26 +454,23 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
def test_llava_merge_inputs_error_bug(self):
|
||||
# This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
|
||||
model_id = "llava-hf/llava-1.5-7b-hf"
|
||||
model = LlavaForConditionalGeneration.from_pretrained(
|
||||
model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True
|
||||
).to(torch_device)
|
||||
model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
||||
|
||||
# Simulate some user inputs
|
||||
pixel_values = torch.randn(
|
||||
(2, 3, 336, 336),
|
||||
(1, 3, 336, 336),
|
||||
dtype=torch.float,
|
||||
device=torch_device,
|
||||
)
|
||||
input_ids = torch.tensor(
|
||||
[
|
||||
[32001, 32001, 1, 15043, 7084, 32000, 29871, 13, 7900],
|
||||
[1, 15043, 7084, 29901, 29871, 32000, 29871, 13, 7900],
|
||||
],
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
attention_mask = torch.tensor(
|
||||
[[0, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]],
|
||||
[[0, 0, 1, 1, 1, 1, 1, 1, 1]],
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
@@ -515,6 +515,31 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
# Make sure that `generate` works
|
||||
_ = model.generate(**inputs, max_new_tokens=20)
|
||||
|
||||
@slow
@require_bitsandbytes
def test_generation_siglip_backbone(self):
    """Generate with the `llava-interleave-qwen-0.5b-hf` checkpoint and compare the decoded text.

    The processor is configured for input expansion (`vision_feature_select_strategy`
    and `patch_size` set), one COCO image is fetched over HTTP, 30 new tokens are
    generated, and the first decoded sequence must equal a fixed reference string.
    """
    model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
    model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype="float16", device_map=torch_device)
    processor = AutoProcessor.from_pretrained(model_id)

    # check processing with expansion of inputs (w/o expansion should work with any backbone)
    processor.vision_feature_select_strategy = "default"
    processor.patch_size = 14

    image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
    raw_image = Image.open(requests.get(image_file, stream=True).raw)
    inputs = processor(
        text="<|im_start|>user\n<image>\nWhat are these?<|im_end|>\n<|im_start|>assistant",
        images=raw_image,
        return_tensors="pt",
    ).to(torch_device, torch.float16)

    # Make sure that `generate` works
    output = model.generate(**inputs, max_new_tokens=30)

    EXPECTED_DECODED_TEXT = "user\n\nWhat are these?\nassistant The image shows two cats, one on the left and one on the right. They appear to be resting or sleeping on a pink blanket. The cat"
    # assertEqual (rather than assertTrue(a == b)) prints both strings when they differ.
    self.assertEqual(processor.batch_decode(output, skip_special_tokens=True)[0], EXPECTED_DECODED_TEXT)
|
||||
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_expansion_in_processing(self):
|
||||
|
||||
@@ -363,11 +363,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
output = model(**inputs)
|
||||
|
||||
expected_slice = torch.tensor(
|
||||
[
|
||||
[-4.7695, -4.5664, -0.2786],
|
||||
[-10.6250, -10.8906, -2.5254],
|
||||
[-6.7383, -7.2461, -0.6787],
|
||||
],
|
||||
[[-4.7695, -4.5664, -0.2788], [-10.6172, -10.8828, -2.5273], [-6.7383, -7.2422, -0.6694]],
|
||||
dtype=torch.float32,
|
||||
device=torch_device,
|
||||
)
|
||||
@@ -471,16 +467,16 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
output = model(**inputs)
|
||||
|
||||
expected_slice = torch.tensor(
|
||||
[[-0.0308, -0.0313, -0.0314], [-0.3064, -0.3013, -0.2986], [-0.1226, -0.1246, -0.1210]],
|
||||
[[-0.1287, -0.1294, -0.1284], [-0.2744, -0.2698, -0.2671], [-0.1071, -0.1091, -0.1056]],
|
||||
dtype=torch.float32,
|
||||
device=torch_device,
|
||||
)
|
||||
assert torch.allclose(output.logits[0, -3:, -3:], expected_slice, atol=1e-3)
|
||||
assert torch.allclose(output.loss, torch.tensor(6.8619, device=torch_device))
|
||||
assert torch.allclose(output.loss, torch.tensor(7.0206, device=torch_device), atol=1e-3)
|
||||
|
||||
# verify generation
|
||||
output = model.generate(**inputs, max_new_tokens=50)
|
||||
EXPECTED_DECODED_TEXT = '[INST] \nWhat is shown in this image? [/INST] The image shows a forested area with a misty or foggy atmosphere. In the foreground, there is a grassy field with a few deer grazing. The deer are partially obscured by the fog, and the trees in the background' # fmt: skip
|
||||
EXPECTED_DECODED_TEXT = '[INST] \nWhat is shown in this image? [/INST] The image shows two deer, likely fawns, in a grassy area with trees in the background. The setting appears to be a forest or woodland, and the photo is taken during what seems to be either dawn or dusk, given' # fmt: skip
|
||||
self.assertEqual(
|
||||
self.processor.decode(output[0], skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
@@ -534,38 +530,66 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
|
||||
# model is in eval mode by default so we should get pad on the left side
|
||||
# we can check the first hidden-states (aka inputs embeds)
|
||||
# the first element was lo-res image and we expect the first 1414 tokens to be all pads
|
||||
output_eval = model(**inputs_batched, output_hidden_states=True)
|
||||
self.assertTrue((output_eval.hidden_states[0][0, :1414, ...] == 0).all().item())
|
||||
|
||||
# otherwise padding is on the right side, so it's last 1414 tokens
|
||||
self.processor.padding_side = "right"
|
||||
inputs_batched = self.processor(
|
||||
[self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
|
||||
).to(torch_device)
|
||||
|
||||
model.train()
|
||||
# the first element was lo-res image and we expect the first 732 tokens to be all pads
|
||||
with torch.no_grad():
|
||||
output_train = model(**inputs_batched, output_hidden_states=True)
|
||||
self.assertTrue((output_train.hidden_states[0][0, -1414:, ...] == 0).all().item())
|
||||
output_eval = model(**inputs_batched, output_hidden_states=True)
|
||||
self.assertTrue((output_eval.hidden_states[0][0, :732, ...] == 0).all().item())
|
||||
|
||||
with self.assertLogs("transformers", level="WARNING") as logs:
|
||||
model.padding_side = "left"
|
||||
model.train()
|
||||
model(**inputs_batched, output_hidden_states=True)
|
||||
with torch.no_grad():
|
||||
model(**inputs_batched, output_hidden_states=True)
|
||||
|
||||
self.assertIn(
|
||||
"Padding side is set to 'left' but the model is in training mode. For training", logs.output[0]
|
||||
)
|
||||
self.assertIn("Padding side is set to 'left' but the model is in training mode. For training", logs)
|
||||
|
||||
with self.assertLogs("transformers", level="WARNING") as logs:
|
||||
model.padding_side = "right"
|
||||
model.eval()
|
||||
model(**inputs_batched, output_hidden_states=True)
|
||||
with torch.no_grad():
|
||||
model(**inputs_batched, output_hidden_states=True)
|
||||
|
||||
self.assertIn(
|
||||
"Padding side is set to 'right' but the model is in inference mode. For correct", logs.output[0]
|
||||
)
|
||||
self.assertIn("Padding side is set to 'right' but the model is in inference mode. For correct", logs)
|
||||
|
||||
@slow
@require_bitsandbytes
def test_expansion_in_processing_multiimage(self):
    """Check that expanded and legacy processing of a two-image prompt generate the same tokens.

    The same prompt and image pair is processed twice: once with the processor's
    `vision_feature_select_strategy` / `patch_size` set (expanded inputs, asserted to
    be 3969 ids long) and once with both set to `None` (legacy inputs, asserted to be
    23 ids long). Exactly 20 tokens are generated from each and the last 20 token
    ids of both outputs must be identical.
    """
    model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
    model = LlavaNextForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
    processor = AutoProcessor.from_pretrained(model_id)

    prompt = "USER: <image><image>\nDescribe the similarity between the two images:\nASSISTANT:"
    image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
    raw_image = Image.open(requests.get(image_file, stream=True).raw)
    deer_image = Image.open(
        requests.get(
            "https://4.img-dpreview.com/files/p/TS560x560~forums/56876524/03975b28741443319e9a94615e35667e",
            stream=True,
        ).raw
    )

    # check processing with expansion of inputs
    processor.vision_feature_select_strategy = "default"
    processor.patch_size = 14
    inputs_expanded = processor(text=prompt, images=[raw_image, deer_image], return_tensors="pt").to(
        torch_device, torch.float16
    )
    # assertEqual reports the actual length on failure, unlike assertTrue(a == b).
    self.assertEqual(inputs_expanded.input_ids.shape[-1], 3969)

    # check processing without expansion of inputs (legacy behavior)
    processor.vision_feature_select_strategy = None
    processor.patch_size = None
    inputs = processor(text=prompt, images=[raw_image, deer_image], return_tensors="pt").to(
        torch_device, torch.float16
    )
    self.assertEqual(inputs.input_ids.shape[-1], 23)

    # generate exactly 20 tokens
    output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
    output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)

    # check that both inputs are handled correctly and generate the same output
    self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
||||
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
|
||||
@@ -18,6 +18,7 @@ import gc
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from transformers import (
|
||||
@@ -363,29 +364,6 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
)
|
||||
|
||||
inputs = self.processor(self.prompt_video, videos=self.video, return_tensors="pt")
|
||||
expected_input_ids = [
|
||||
1,
|
||||
3148,
|
||||
1001,
|
||||
29901,
|
||||
29871,
|
||||
32000,
|
||||
13,
|
||||
11008,
|
||||
338,
|
||||
445,
|
||||
4863,
|
||||
2090,
|
||||
1460,
|
||||
29973,
|
||||
319,
|
||||
1799,
|
||||
9047,
|
||||
13566,
|
||||
29901,
|
||||
]
|
||||
self.assertListEqual(expected_input_ids, inputs.input_ids[0].tolist())
|
||||
|
||||
# verify single forward pass
|
||||
inputs = inputs.to(torch_device)
|
||||
with torch.no_grad():
|
||||
@@ -393,7 +371,7 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
|
||||
# verify generation
|
||||
output = model.generate(**inputs, do_sample=False, max_new_tokens=40)
|
||||
EXPECTED_DECODED_TEXT = 'USER: \nWhy is this video funny? ASSISTANT: The humor in this video comes from the unexpected and exaggerated reactions of the child to the book. The child appears to be reading a book, but instead of a calm and focused reading experience' # fmt: skip
|
||||
EXPECTED_DECODED_TEXT = 'USER: \nWhy is this video funny? ASSISTANT: The humor in this video comes from the unexpected and somewhat comical situation of a young child reading a book while another child is attempting to read the same book. The child who is reading the book seems' # fmt: skip
|
||||
|
||||
self.assertEqual(
|
||||
self.processor.decode(output[0], skip_special_tokens=True),
|
||||
@@ -416,7 +394,10 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
|
||||
output = model.generate(**inputs, do_sample=False, max_new_tokens=20)
|
||||
|
||||
EXPECTED_DECODED_TEXT = ['USER: \nWhy is this video funny? ASSISTANT: The humor in this video comes from the unexpected and exaggerated reactions of the child to the', 'USER: \nWhy is this video funny? ASSISTANT: The humor in this video comes from the unexpected and exaggerated reactions of the child to the'] # fmt: skip
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
'USER: \nWhy is this video funny? ASSISTANT: The humor in this video comes from the unexpected and somewhat comical situation of a young child reading a',
|
||||
'USER: \nWhy is this video funny? ASSISTANT: The humor in this video comes from the unexpected and somewhat comical situation of a young child reading a'
|
||||
] # fmt: skip
|
||||
self.assertEqual(
|
||||
self.processor.batch_decode(output, skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
@@ -447,7 +428,7 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
|
||||
# verify generation
|
||||
output = model.generate(**inputs, do_sample=False, max_new_tokens=50)
|
||||
EXPECTED_DECODED_TEXT = 'USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a graphical representation of a benchmark test for a machine learning model. It shows the performance of various models on a task, with the x-axis representing the number of parameters (measured in millions) and the y' # fmt: skip
|
||||
EXPECTED_DECODED_TEXT = 'USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a graphical representation of a machine learning model\'s performance on a task, likely related to natural language processing or text understanding. It shows a scatter plot with two axes, one labeled "BLIP-2"' # fmt: skip
|
||||
self.assertEqual(self.processor.decode(output[0], skip_special_tokens=True), EXPECTED_DECODED_TEXT)
|
||||
|
||||
@slow
|
||||
@@ -493,41 +474,25 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
# model is in eval mode by default so we should get pad on the left side
|
||||
# we can check the first hidden-states (aka inputs embeds)
|
||||
# the first element was lo-res image and we expect the first 1482 tokens to be all pads
|
||||
output_eval = model(**inputs_batched, output_hidden_states=True)
|
||||
self.assertTrue((output_eval.hidden_states[0][0, :1482, ...] == 0).all().item())
|
||||
|
||||
# otherwise padding is on the right side, so it's last 1482 tokens
|
||||
self.processor.padding_side = "right"
|
||||
inputs_batched = self.processor(
|
||||
[self.prompt_video, self.prompt_image],
|
||||
images=[self.image],
|
||||
videos=[self.video],
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
).to(torch_device)
|
||||
|
||||
model.train()
|
||||
with torch.no_grad():
|
||||
output_train = model(**inputs_batched, output_hidden_states=True)
|
||||
self.assertTrue((output_train.hidden_states[0][0, -1482:, ...] == 0).all().item())
|
||||
output_eval = model(**inputs_batched, output_hidden_states=True)
|
||||
self.assertTrue((output_eval.hidden_states[0][0, :1482, ...] == 0).all().item())
|
||||
|
||||
with self.assertLogs("transformers", level="WARNING") as logs:
|
||||
model.padding_side = "left"
|
||||
model.train()
|
||||
model(**inputs_batched, output_hidden_states=True)
|
||||
with torch.no_grad():
|
||||
model(**inputs_batched, output_hidden_states=True)
|
||||
|
||||
self.assertIn(
|
||||
"Padding side is set to 'left' but the model is in training mode. For training", logs.output[0]
|
||||
)
|
||||
self.assertIn("Padding side is set to 'left' but the model is in training mode. For training", logs)
|
||||
|
||||
with self.assertLogs("transformers", level="WARNING") as logs:
|
||||
model.padding_side = "right"
|
||||
model.eval()
|
||||
model(**inputs_batched, output_hidden_states=True)
|
||||
with torch.no_grad():
|
||||
model(**inputs_batched, output_hidden_states=True)
|
||||
|
||||
self.assertIn(
|
||||
"Padding side is set to 'right' but the model is in inference mode. For correct", logs.output[0]
|
||||
)
|
||||
self.assertIn("Padding side is set to 'right' but the model is in inference mode. For correct", logs)
|
||||
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
@@ -556,3 +521,73 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
|
||||
# check that both inputs are handled correctly and generate the same output
|
||||
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
||||
|
||||
@slow
@require_bitsandbytes
def test_expansion_in_processing_images(self):
    """Check that expanded and legacy processing of a single-image prompt generate the same tokens.

    `self.prompt_image` / `self.image` are processed once with the processor's
    `vision_feature_select_strategy` / `patch_size` set (expanded inputs, asserted to
    be 2652 ids long) and once with both set to `None` (legacy inputs, asserted to be
    19 ids long). Exactly 20 tokens are generated from each and the last 20 token
    ids of both outputs must be identical.
    """
    model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
    # Reuse model_id so the model and the processor cannot drift to different checkpoints.
    model = LlavaNextVideoForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
    processor = AutoProcessor.from_pretrained(model_id)

    # check processing with expansion of inputs
    processor.vision_feature_select_strategy = "default"
    processor.patch_size = 14
    inputs_expanded = processor(self.prompt_image, images=[self.image], return_tensors="pt").to(torch_device)
    # assertEqual reports the actual length on failure, unlike assertTrue(a == b).
    self.assertEqual(inputs_expanded.input_ids.shape[-1], 2652)

    # check processing without expansion of inputs (legacy behavior)
    processor.vision_feature_select_strategy = None
    processor.patch_size = None
    inputs = processor(self.prompt_image, images=[self.image], return_tensors="pt").to(torch_device)
    self.assertEqual(inputs.input_ids.shape[-1], 19)

    # generate exactly 20 tokens
    output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
    output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)

    # check that both inputs are handled correctly and generate the same output
    self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
||||
|
||||
@slow
@require_bitsandbytes
def test_expansion_in_processing_multiimage(self):
    """Check that expanded and legacy processing of a two-image prompt generate the same tokens.

    The same prompt and image pair is processed twice: once with the processor's
    `vision_feature_select_strategy` / `patch_size` set (expanded inputs, asserted to
    be 3968 ids long) and once with both set to `None` (legacy inputs, asserted to be
    22 ids long). Exactly 20 tokens are generated from each and the last 20 token
    ids of both outputs must be identical.
    """
    model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
    # Reuse model_id so the model and the processor cannot drift to different checkpoints.
    model = LlavaNextVideoForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
    processor = AutoProcessor.from_pretrained(model_id)

    prompt = "USER: <image><image>\nDescribe the similarity between the two images:\nASSISTANT:"
    image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
    raw_image = Image.open(requests.get(image_file, stream=True).raw)
    deer_image = Image.open(
        requests.get(
            "https://4.img-dpreview.com/files/p/TS560x560~forums/56876524/03975b28741443319e9a94615e35667e",
            stream=True,
        ).raw
    )

    # check processing with expansion of inputs
    processor.vision_feature_select_strategy = "default"
    processor.patch_size = 14
    inputs_expanded = processor(text=prompt, images=[raw_image, deer_image], return_tensors="pt").to(
        torch_device, torch.float16
    )
    # assertEqual reports the actual length on failure, unlike assertTrue(a == b).
    self.assertEqual(inputs_expanded.input_ids.shape[-1], 3968)

    # check processing without expansion of inputs (legacy behavior)
    processor.vision_feature_select_strategy = None
    processor.patch_size = None
    inputs = processor(text=prompt, images=[raw_image, deer_image], return_tensors="pt").to(
        torch_device, torch.float16
    )
    self.assertEqual(inputs.input_ids.shape[-1], 22)

    # generate exactly 20 tokens
    output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
    output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)

    # check that both inputs are handled correctly and generate the same output
    self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
||||
|
||||
@@ -383,18 +383,19 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)
|
||||
|
||||
prompt = "USER: <video>Why is this video funny? ASSISTANT:"
|
||||
prompt = "USER: <video>\nWhy is this video funny? ASSISTANT:"
|
||||
video_file = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
|
||||
)
|
||||
video_file = np.load(video_file)
|
||||
inputs = self.processor(prompt, videos=video_file, return_tensors="pt")
|
||||
|
||||
EXPECTED_INPUT_IDS = torch.tensor([[1, 3148, 1001, 29901, 29871, 32001, 3750, 338, 445, 4863, 2090, 1460, 29973, 319, 1799, 9047, 13566, 29901]]) # fmt: skip
|
||||
EXPECTED_INPUT_IDS = torch.tensor([[1, 3148, 1001, 29901, 29871, 32001, 13, 11008, 338, 445, 4863, 2090, 1460, 29973, 319, 1799, 9047, 13566, 29901]]) # fmt: skip
|
||||
|
||||
self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
|
||||
|
||||
output = model.generate(**inputs, do_sample=False, max_new_tokens=20)
|
||||
EXPECTED_DECODED_TEXT = "USER: Why is this video funny? ASSISTANT: The video is funny because the baby is playing with a Wii remote while sitting on a bed" # fmt: skip
|
||||
EXPECTED_DECODED_TEXT = "USER: \nWhy is this video funny? ASSISTANT: The video is funny because it shows a baby sitting on a bed and reading a book, which" # fmt: skip
|
||||
|
||||
self.assertEqual(
|
||||
self.processor.decode(output[0], skip_special_tokens=True),
|
||||
@@ -404,12 +405,11 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_mixed_inputs(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)
|
||||
|
||||
prompts = [
|
||||
"USER: <image>What are the cats in the image doing? ASSISTANT:",
|
||||
"USER: <video>Why is this video funny? ASSISTANT:",
|
||||
"USER: <image>\nWhat are the cats in the image doing? ASSISTANT:",
|
||||
"USER: <video>\nWhy is this video funny? ASSISTANT:",
|
||||
]
|
||||
video_file = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
|
||||
@@ -422,8 +422,8 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
output = model.generate(**inputs, do_sample=False, max_new_tokens=20)
|
||||
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
'USER: What are the cats in the image doing? ASSISTANT: The cats in the image are lying down on a red couch, possibly sleeping or rest',
|
||||
'USER: Why is this video funny? ASSISTANT: The video is funny because the baby is playing with a Wii remote while sitting on a bed'
|
||||
'USER: \nWhat are the cats in the image doing? ASSISTANT: The cats in the image are sleeping or resting on a couch.',
|
||||
'USER: \nWhy is this video funny? ASSISTANT: The video is funny because it shows a baby sitting on a bed and reading a book. The'
|
||||
] # fmt: skip
|
||||
|
||||
self.assertEqual(
|
||||
@@ -434,12 +434,10 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_llama(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)
|
||||
processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
|
||||
|
||||
prompt = "USER: <video>Describe the video in details. ASSISTANT:"
|
||||
prompt = "USER: <video>\nDescribe the video in details. ASSISTANT:"
|
||||
video_file = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
|
||||
)
|
||||
@@ -447,11 +445,11 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
inputs = self.processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
|
||||
|
||||
output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
|
||||
EXPECTED_DECODED_TEXT = "USER: Describe the video in details. ASSISTANT: The video features a young child sitting on a bed, holding a book and reading it. " \
|
||||
"The child appears to be enjoying the book, as they are fully engaged in the reading process. The bed is located in a bedroom, and there is a chair nearby. " \
|
||||
"The child is wearing a light blue shirt and pink pants, and they have glasses on. The room is well-lit, and there is a clock on the wall. The child seems " \
|
||||
"to be in a comfortable and relaxed environment, which is conducive to reading and learning. Overall, the video captures a heartwarming moment of a child " \
|
||||
"engaging in a simple yet essential activity, which is reading." # fmt: skip
|
||||
EXPECTED_DECODED_TEXT = "USER: \nDescribe the video in details. ASSISTANT: The video features a young child sitting on a bed, holding a book and reading it. " \
|
||||
"The child appears to be enjoying the book, as they are fully engaged in the activity. The bed is located in a bedroom, and there is a chair nearby. The " \
|
||||
"child is wearing a blue shirt and glasses, which suggests that they might have a visual impairment. The room is well-lit, and there is a clock on the wall, " \
|
||||
"indicating the time. The child's focus on the book indicates that they are interested in the content and are actively participating in the reading process. " \
|
||||
"Overall, the video captures a heartwarming moment of a child engaging in a simple yet essential activity, which is reading." # fmt: skip
|
||||
|
||||
self.assertEqual(
|
||||
processor.decode(output[0], skip_special_tokens=True),
|
||||
@@ -461,15 +459,13 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_llama_batched(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)
|
||||
processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
|
||||
processor.tokenizer.padding_side = "left"
|
||||
|
||||
prompts = [
|
||||
"USER: <video>What is the baby doing? ASSISTANT:",
|
||||
"USER: <video>Who is sitting next to the woman? ASSISTANT:",
|
||||
"USER: <video>\nWhat is the baby doing? ASSISTANT:",
|
||||
"USER: <video>\nWho is sitting next to the woman? ASSISTANT:",
|
||||
]
|
||||
video_1 = np.load(
|
||||
hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset")
|
||||
@@ -483,48 +479,12 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
output = model.generate(**inputs, max_new_tokens=20)
|
||||
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
'USER: What is the baby doing? ASSISTANT: The baby is sitting on a bed and reading a book.Ъ',
|
||||
'USER: Who is sitting next to the woman? ASSISTANT: A small dog is sitting next to the woman.Ъ'
|
||||
'USER: \nWhat is the baby doing? ASSISTANT: The baby is sitting on a bed and reading a book.',
|
||||
'USER: \nWho is sitting next to the woman? ASSISTANT: A small dog is sitting next to the woman.'
|
||||
] # fmt: skip
|
||||
|
||||
self.assertEqual(processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)
|
||||
|
||||
@slow
@require_bitsandbytes
def test_small_model_integration_test_llama_batched_regression(self):
    """Batched generation with three videos spread over two prompts, using eager attention.

    The model is loaded with ``load_in_4bit=True`` and ``attn_implementation="eager"``;
    the processor is created with an explicit ``<pad>`` token and left padding. The
    decoded batch output is compared against fixed reference strings.
    """
    # Let's make sure we test the preprocessing to replace what is used

    # Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA,
    # this tests if "eager" works as before)
    checkpoint = "LanguageBind/Video-LLaVA-7B-hf"
    model = VideoLlavaForConditionalGeneration.from_pretrained(
        checkpoint,
        load_in_4bit=True,
        attn_implementation="eager",
    )
    processor = VideoLlavaProcessor.from_pretrained(checkpoint, pad_token="<pad>")
    processor.tokenizer.padding_side = "left"

    # NOTE(review): the "ASSITANT" spelling in the second prompt is part of the runtime
    # input and is mirrored in the expected output below, so it is kept as-is.
    prompts = [
        "USER: <video>What is the baby doing? ASSISTANT:",
        "USER: <video>Who is sitting next to the woman? ASSISTANT: A small dog is sitting next to the woman. USER: <video>What about this video? ASSITANT:",
    ]

    first_clip_path = hf_hub_download(
        repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
    )
    first_clip = np.load(first_clip_path)
    second_clip_path = hf_hub_download(
        repo_id="raushan-testing-hf/videos-test", filename="video_demo_2.npy", repo_type="dataset"
    )
    second_clip = np.load(second_clip_path)

    # The first clip is passed twice: once for prompt one, once for the second <video> of prompt two.
    clips = [first_clip, second_clip, first_clip]
    inputs = processor(prompts, videos=clips, return_tensors="pt", padding=True)

    generated = model.generate(**inputs, max_new_tokens=20)
    decoded = processor.batch_decode(generated, skip_special_tokens=True)

    # fmt: off
    EXPECTED_DECODED_TEXT = [
        'USER: What is the baby doing? ASSISTANT: The baby is sitting on a bed and reading a book.Ъ',
        'USER: Who is sitting next to the woman? ASSISTANT: A small dog is sitting next to the woman. USER: What about this video? ASSITANT: The video shows a baby sitting on a bed, reading a book. The baby is wearing glass'
    ]
    # fmt: on

    self.assertEqual(decoded, EXPECTED_DECODED_TEXT)
|
||||
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_video_llava_index_error_bug(self):
|
||||
@@ -552,32 +512,23 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@require_torch_gpu
|
||||
def test_video_llava_merge_inputs_error_bug(self):
|
||||
# This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained(
|
||||
"LanguageBind/Video-LLaVA-7B-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True
|
||||
).to(torch_device)
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)
|
||||
|
||||
# Simulate some user inputs
|
||||
pixel_values_videos = torch.randn(
|
||||
(2, 8, 3, 224, 224),
|
||||
(1, 8, 3, 224, 224),
|
||||
dtype=torch.float,
|
||||
device=torch_device,
|
||||
)
|
||||
# fmt: off
|
||||
input_ids = torch.tensor(
|
||||
[
|
||||
[
|
||||
32001, 32001, 1, 15043, 7084, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 29871, 13, 7900
|
||||
],
|
||||
[
|
||||
1, 15043, 7084, 29901, 29871, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 29871, 13, 7900
|
||||
],
|
||||
],
|
||||
[[32002, 32002, 1, 15043, 7084, 32001, 29871, 13, 7900]],
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
# fmt: on
|
||||
attention_mask = torch.tensor(
|
||||
[[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
|
||||
[[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
@@ -591,6 +542,36 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
).loss
|
||||
loss.backward()
|
||||
|
||||
@slow
@require_bitsandbytes
def test_expansion_in_processing_images(self):
    """Check that expanded and legacy image processing yield the same generation.

    The processor is run twice on the same prompt and image: once with
    ``vision_feature_select_strategy="default"`` and ``patch_size=14`` (inputs
    expanded in processing, 274 input ids expected) and once with both set to
    ``None`` (legacy behavior, 19 input ids expected). Both input sets are then
    used to generate exactly 20 new tokens, which must be identical.
    """
    model_id = "LanguageBind/Video-LLaVA-7B-hf"
    model = VideoLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
    processor = VideoLlavaProcessor.from_pretrained(model_id)

    prompt = "USER: <image>\nDescribe the image in details. ASSISTANT:"
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    # check processing with expansion of inputs
    processor.vision_feature_select_strategy = "default"
    processor.patch_size = 14
    inputs_expanded = processor(prompt, images=image, return_tensors="pt").to(torch_device, torch.float16)
    # assertEqual (rather than assertTrue(a == b)) reports the actual length on failure
    self.assertEqual(inputs_expanded.input_ids.shape[-1], 274)

    # check processing without expansion of inputs (legacy behavior)
    processor.vision_feature_select_strategy = None
    processor.patch_size = None
    inputs = processor(prompt, images=image, return_tensors="pt").to(torch_device, torch.float16)
    self.assertEqual(inputs.input_ids.shape[-1], 19)

    # generate exactly 20 tokens
    output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
    output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)

    # check that both inputs are handled correctly and generate the same output
    self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
||||
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_expansion_in_processing(self):
|
||||
@@ -598,7 +579,7 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
||||
processor = VideoLlavaProcessor.from_pretrained(model_id)
|
||||
|
||||
prompt = "USER: <video>Describe the video in details. ASSISTANT:"
|
||||
prompt = "USER: <video>\nDescribe the video in details. ASSISTANT:"
|
||||
video_file = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
|
||||
)
|
||||
@@ -608,13 +589,13 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
processor.vision_feature_select_strategy = "default"
|
||||
processor.patch_size = 14
|
||||
inputs_expanded = processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
|
||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2073)
|
||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2074)
|
||||
|
||||
# check processing without expansion of inputs (legacy behavior)
|
||||
processor.vision_feature_select_strategy = None
|
||||
processor.patch_size = None
|
||||
inputs = processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
|
||||
self.assertTrue(inputs.input_ids.shape[-1] == 18)
|
||||
self.assertTrue(inputs.input_ids.shape[-1] == 19)
|
||||
|
||||
# generate exactly 20 tokens
|
||||
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
|
||||
|
||||
@@ -271,26 +271,23 @@ class VipLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
def test_vipllava_merge_inputs_error_bug(self):
|
||||
# This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
|
||||
model_id = "llava-hf/vip-llava-7b-hf"
|
||||
model = VipLlavaForConditionalGeneration.from_pretrained(
|
||||
model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True
|
||||
).to(torch_device)
|
||||
model = VipLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
||||
|
||||
# Simulate some user inputs
|
||||
pixel_values = torch.randn(
|
||||
(2, 3, 336, 336),
|
||||
(1, 3, 336, 336),
|
||||
dtype=torch.float,
|
||||
device=torch_device,
|
||||
)
|
||||
input_ids = torch.tensor(
|
||||
[
|
||||
[32001, 32001, 1, 15043, 7084, 32000, 29871, 13, 7900],
|
||||
[1, 15043, 7084, 29901, 29871, 32000, 29871, 13, 7900],
|
||||
],
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
attention_mask = torch.tensor(
|
||||
[[0, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]],
|
||||
[[0, 0, 1, 1, 1, 1, 1, 1, 1]],
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user