VLM: fixes after refactor (#32907)
* leave only half of the changes * fix tests * [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava * fix tests, first try * [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava * fix, second try * [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava * fix * [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava
This commit is contained in:
committed by
GitHub
parent
f24f084329
commit
7d2d6ce9cb
@@ -383,18 +383,19 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)
|
||||
|
||||
prompt = "USER: <video>Why is this video funny? ASSISTANT:"
|
||||
prompt = "USER: <video>\nWhy is this video funny? ASSISTANT:"
|
||||
video_file = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
|
||||
)
|
||||
video_file = np.load(video_file)
|
||||
inputs = self.processor(prompt, videos=video_file, return_tensors="pt")
|
||||
|
||||
EXPECTED_INPUT_IDS = torch.tensor([[1, 3148, 1001, 29901, 29871, 32001, 3750, 338, 445, 4863, 2090, 1460, 29973, 319, 1799, 9047, 13566, 29901]]) # fmt: skip
|
||||
EXPECTED_INPUT_IDS = torch.tensor([[1, 3148, 1001, 29901, 29871, 32001, 13, 11008, 338, 445, 4863, 2090, 1460, 29973, 319, 1799, 9047, 13566, 29901]]) # fmt: skip
|
||||
|
||||
self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
|
||||
|
||||
output = model.generate(**inputs, do_sample=False, max_new_tokens=20)
|
||||
EXPECTED_DECODED_TEXT = "USER: Why is this video funny? ASSISTANT: The video is funny because the baby is playing with a Wii remote while sitting on a bed" # fmt: skip
|
||||
EXPECTED_DECODED_TEXT = "USER: \nWhy is this video funny? ASSISTANT: The video is funny because it shows a baby sitting on a bed and reading a book, which" # fmt: skip
|
||||
|
||||
self.assertEqual(
|
||||
self.processor.decode(output[0], skip_special_tokens=True),
|
||||
@@ -404,12 +405,11 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_mixed_inputs(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)
|
||||
|
||||
prompts = [
|
||||
"USER: <image>What are the cats in the image doing? ASSISTANT:",
|
||||
"USER: <video>Why is this video funny? ASSISTANT:",
|
||||
"USER: <image>\nWhat are the cats in the image doing? ASSISTANT:",
|
||||
"USER: <video>\nWhy is this video funny? ASSISTANT:",
|
||||
]
|
||||
video_file = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
|
||||
@@ -422,8 +422,8 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
output = model.generate(**inputs, do_sample=False, max_new_tokens=20)
|
||||
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
'USER: What are the cats in the image doing? ASSISTANT: The cats in the image are lying down on a red couch, possibly sleeping or rest',
|
||||
'USER: Why is this video funny? ASSISTANT: The video is funny because the baby is playing with a Wii remote while sitting on a bed'
|
||||
'USER: \nWhat are the cats in the image doing? ASSISTANT: The cats in the image are sleeping or resting on a couch.',
|
||||
'USER: \nWhy is this video funny? ASSISTANT: The video is funny because it shows a baby sitting on a bed and reading a book. The'
|
||||
] # fmt: skip
|
||||
|
||||
self.assertEqual(
|
||||
@@ -434,12 +434,10 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_llama(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)
|
||||
processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
|
||||
|
||||
prompt = "USER: <video>Describe the video in details. ASSISTANT:"
|
||||
prompt = "USER: <video>\nDescribe the video in details. ASSISTANT:"
|
||||
video_file = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
|
||||
)
|
||||
@@ -447,11 +445,11 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
inputs = self.processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
|
||||
|
||||
output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
|
||||
EXPECTED_DECODED_TEXT = "USER: Describe the video in details. ASSISTANT: The video features a young child sitting on a bed, holding a book and reading it. " \
|
||||
"The child appears to be enjoying the book, as they are fully engaged in the reading process. The bed is located in a bedroom, and there is a chair nearby. " \
|
||||
"The child is wearing a light blue shirt and pink pants, and they have glasses on. The room is well-lit, and there is a clock on the wall. The child seems " \
|
||||
"to be in a comfortable and relaxed environment, which is conducive to reading and learning. Overall, the video captures a heartwarming moment of a child " \
|
||||
"engaging in a simple yet essential activity, which is reading." # fmt: skip
|
||||
EXPECTED_DECODED_TEXT = "USER: \nDescribe the video in details. ASSISTANT: The video features a young child sitting on a bed, holding a book and reading it. " \
|
||||
"The child appears to be enjoying the book, as they are fully engaged in the activity. The bed is located in a bedroom, and there is a chair nearby. The " \
|
||||
"child is wearing a blue shirt and glasses, which suggests that they might have a visual impairment. The room is well-lit, and there is a clock on the wall, " \
|
||||
"indicating the time. The child's focus on the book indicates that they are interested in the content and are actively participating in the reading process. " \
|
||||
"Overall, the video captures a heartwarming moment of a child engaging in a simple yet essential activity, which is reading." # fmt: skip
|
||||
|
||||
self.assertEqual(
|
||||
processor.decode(output[0], skip_special_tokens=True),
|
||||
@@ -461,15 +459,13 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_llama_batched(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)
|
||||
processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
|
||||
processor.tokenizer.padding_side = "left"
|
||||
|
||||
prompts = [
|
||||
"USER: <video>What is the baby doing? ASSISTANT:",
|
||||
"USER: <video>Who is sitting next to the woman? ASSISTANT:",
|
||||
"USER: <video>\nWhat is the baby doing? ASSISTANT:",
|
||||
"USER: <video>\nWho is sitting next to the woman? ASSISTANT:",
|
||||
]
|
||||
video_1 = np.load(
|
||||
hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset")
|
||||
@@ -483,48 +479,12 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
output = model.generate(**inputs, max_new_tokens=20)
|
||||
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
'USER: What is the baby doing? ASSISTANT: The baby is sitting on a bed and reading a book.Ъ',
|
||||
'USER: Who is sitting next to the woman? ASSISTANT: A small dog is sitting next to the woman.Ъ'
|
||||
'USER: \nWhat is the baby doing? ASSISTANT: The baby is sitting on a bed and reading a book.',
|
||||
'USER: \nWho is sitting next to the woman? ASSISTANT: A small dog is sitting next to the woman.'
|
||||
] # fmt: skip
|
||||
|
||||
self.assertEqual(processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)
|
||||
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_llama_batched_regression(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
|
||||
# Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before)
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained(
|
||||
"LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True, attn_implementation="eager"
|
||||
)
|
||||
processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", pad_token="<pad>")
|
||||
processor.tokenizer.padding_side = "left"
|
||||
|
||||
prompts = [
|
||||
"USER: <video>What is the baby doing? ASSISTANT:",
|
||||
"USER: <video>Who is sitting next to the woman? ASSISTANT: A small dog is sitting next to the woman. USER: <video>What about this video? ASSITANT:",
|
||||
]
|
||||
video_1 = np.load(
|
||||
hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset")
|
||||
)
|
||||
video_2 = np.load(
|
||||
hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="video_demo_2.npy", repo_type="dataset")
|
||||
)
|
||||
|
||||
inputs = processor(prompts, videos=[video_1, video_2, video_1], return_tensors="pt", padding=True)
|
||||
|
||||
output = model.generate(**inputs, max_new_tokens=20)
|
||||
|
||||
# fmt: off
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
'USER: What is the baby doing? ASSISTANT: The baby is sitting on a bed and reading a book.Ъ',
|
||||
'USER: Who is sitting next to the woman? ASSISTANT: A small dog is sitting next to the woman. USER: What about this video? ASSITANT: The video shows a baby sitting on a bed, reading a book. The baby is wearing glass'
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
self.assertEqual(processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)
|
||||
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_video_llava_index_error_bug(self):
|
||||
@@ -552,32 +512,23 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@require_torch_gpu
|
||||
def test_video_llava_merge_inputs_error_bug(self):
|
||||
# This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained(
|
||||
"LanguageBind/Video-LLaVA-7B-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True
|
||||
).to(torch_device)
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)
|
||||
|
||||
# Simulate some user inputs
|
||||
pixel_values_videos = torch.randn(
|
||||
(2, 8, 3, 224, 224),
|
||||
(1, 8, 3, 224, 224),
|
||||
dtype=torch.float,
|
||||
device=torch_device,
|
||||
)
|
||||
# fmt: off
|
||||
input_ids = torch.tensor(
|
||||
[
|
||||
[
|
||||
32001, 32001, 1, 15043, 7084, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 29871, 13, 7900
|
||||
],
|
||||
[
|
||||
1, 15043, 7084, 29901, 29871, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 29871, 13, 7900
|
||||
],
|
||||
],
|
||||
[[32002, 32002, 1, 15043, 7084, 32001, 29871, 13, 7900]],
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
# fmt: on
|
||||
attention_mask = torch.tensor(
|
||||
[[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
|
||||
[[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
@@ -591,6 +542,36 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
).loss
|
||||
loss.backward()
|
||||
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_expansion_in_processing_images(self):
|
||||
model_id = "LanguageBind/Video-LLaVA-7B-hf"
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
||||
processor = VideoLlavaProcessor.from_pretrained(model_id)
|
||||
|
||||
prompt = "USER: <image>\nDescribe the image in details. ASSISTANT:"
|
||||
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
# check processing with expansion of inputs
|
||||
processor.vision_feature_select_strategy = "default"
|
||||
processor.patch_size = 14
|
||||
inputs_expanded = processor(prompt, images=image, return_tensors="pt").to(torch_device, torch.float16)
|
||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 274)
|
||||
|
||||
# check processing without expansion of inputs (legacy behavior)
|
||||
processor.vision_feature_select_strategy = None
|
||||
processor.patch_size = None
|
||||
inputs = processor(prompt, images=image, return_tensors="pt").to(torch_device, torch.float16)
|
||||
self.assertTrue(inputs.input_ids.shape[-1] == 19)
|
||||
|
||||
# generate exactly 20 tokens
|
||||
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
|
||||
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
|
||||
|
||||
# check that both inputs are handled correctly and generate the same output
|
||||
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
||||
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_expansion_in_processing(self):
|
||||
@@ -598,7 +579,7 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
model = VideoLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
||||
processor = VideoLlavaProcessor.from_pretrained(model_id)
|
||||
|
||||
prompt = "USER: <video>Describe the video in details. ASSISTANT:"
|
||||
prompt = "USER: <video>\nDescribe the video in details. ASSISTANT:"
|
||||
video_file = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
|
||||
)
|
||||
@@ -608,13 +589,13 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
processor.vision_feature_select_strategy = "default"
|
||||
processor.patch_size = 14
|
||||
inputs_expanded = processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
|
||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2073)
|
||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2074)
|
||||
|
||||
# check processing without expansion of inputs (legacy behavior)
|
||||
processor.vision_feature_select_strategy = None
|
||||
processor.patch_size = None
|
||||
inputs = processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
|
||||
self.assertTrue(inputs.input_ids.shape[-1] == 18)
|
||||
self.assertTrue(inputs.input_ids.shape[-1] == 19)
|
||||
|
||||
# generate exactly 20 tokens
|
||||
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
|
||||
|
||||
Reference in New Issue
Block a user