VLMs: major clean up 🧼 (#34502)

only lllava models are modified
This commit is contained in:
Raushan Turganbay
2025-01-08 10:35:23 +01:00
committed by GitHub
parent 7176e06b52
commit d1681ec2b6
19 changed files with 197 additions and 1028 deletions

View File

@@ -17,7 +17,6 @@
import unittest
import numpy as np
import requests
from huggingface_hub import hf_hub_download
from transformers import (
@@ -543,107 +542,3 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
model(**inputs_batched, output_hidden_states=True)
self.assertIn("Padding side is set to 'right' but the model is in inference mode. For correct", logs)
@slow
@require_bitsandbytes
def test_expansion_in_processing(self):
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
"llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True
)
processor = AutoProcessor.from_pretrained(model_id)
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
processor.num_additional_image_tokens = 1
inputs_expanded = processor(self.prompt_video, videos=[self.video], return_tensors="pt").to(torch_device)
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 1170)
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
processor.num_additional_image_tokens = None
inputs = processor(self.prompt_video, videos=[self.video], return_tensors="pt").to(torch_device)
self.assertTrue(inputs.input_ids.shape[-1] == 19)
# generate exactly 20 tokens
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
# check that both inputs are handled correctly and generate the same output
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
@slow
@require_bitsandbytes
def test_expansion_in_processing_images(self):
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
"llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True
)
processor = AutoProcessor.from_pretrained(model_id)
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
processor.num_additional_image_tokens = 1
inputs_expanded = processor(self.prompt_image, images=[self.image], return_tensors="pt").to(torch_device)
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2652)
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
processor.num_additional_image_tokens = None
inputs = processor(self.prompt_image, images=[self.image], return_tensors="pt").to(torch_device)
self.assertTrue(inputs.input_ids.shape[-1] == 19)
# generate exactly 20 tokens
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
# check that both inputs are handled correctly and generate the same output
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
@slow
@require_bitsandbytes
def test_expansion_in_processing_multiimage(self):
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
"llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True
)
processor = AutoProcessor.from_pretrained(model_id)
prompt = "USER: <image><image>\nDescribe the similarity between the two images:\nASSISTANT:"
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
deer_image = Image.open(
requests.get(
"https://4.img-dpreview.com/files/p/TS560x560~forums/56876524/03975b28741443319e9a94615e35667e",
stream=True,
).raw
)
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
processor.num_additional_image_tokens = 1
inputs_expanded = processor(text=prompt, images=[raw_image, deer_image], return_tensors="pt").to(
torch_device, torch.float16
)
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 3968)
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
processor.num_additional_image_tokens = None
inputs = processor(text=prompt, images=[raw_image, deer_image], return_tensors="pt").to(
torch_device, torch.float16
)
self.assertTrue(inputs.input_ids.shape[-1] == 22)
# generate exactly 20 tokens
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
# check that both inputs are handled correctly and generate the same output
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())