Correct llava mask & fix missing setter for vocab_size (#29389)

* correct llava mask

* fix vipllava as wlel

* mask out embedding for padding tokens

* add test

* fix style

* add setter

* fix test on suggestion
This commit is contained in:
fxmarty
2024-03-22 19:57:08 +08:00
committed by GitHub
parent aa17cf986f
commit 13b23704a8
5 changed files with 74 additions and 10 deletions

View File

@@ -27,7 +27,14 @@ from transformers import (
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import require_bitsandbytes, require_torch, require_torch_gpu, slow, torch_device
from transformers.testing_utils import (
require_bitsandbytes,
require_torch,
require_torch_gpu,
require_vision,
slow,
torch_device,
)
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
@@ -470,10 +477,45 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=20)
EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this serene location, one should be cautious about the weather conditions and potential', 'USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT: A cat sleeping on a bed.'] # fmt: skip
EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, which appears to be a dock or pier extending over a body of water', 'USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT: A cat sleeping on a bed.'] # fmt: skip
self.assertEqual(processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)
@slow
@require_torch
@require_vision
def test_batched_generation(self):
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf").to(torch_device)
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
prompt1 = "<image>\n<image>\nUSER: What's the the difference of two images?\nASSISTANT:"
prompt2 = "<image>\nUSER: Describe the image.\nASSISTANT:"
prompt3 = "<image>\nUSER: Describe the image.\nASSISTANT:"
url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
url2 = "https://images.unsplash.com/photo-1617258683320-61900b281ced?q=80&w=3087&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
image1 = Image.open(requests.get(url1, stream=True).raw)
image2 = Image.open(requests.get(url2, stream=True).raw)
inputs = processor(
text=[prompt1, prompt2, prompt3],
images=[image1, image2, image1, image2],
return_tensors="pt",
padding=True,
).to(torch_device)
model = model.eval()
EXPECTED_OUTPUT = [
"\n \nUSER: What's the the difference of two images?\nASSISTANT: In the two images, the primary difference is the presence of a small dog holding a flower in one",
"\nUSER: Describe the image.\nASSISTANT: The image features a small, fluffy dog sitting on a sidewalk. The dog is holding",
"\nUSER: Describe the image.\nASSISTANT: The image features a lone, adult llama standing on a grassy hill. The llama",
]
generate_ids = model.generate(**inputs, max_new_tokens=20)
outputs = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
self.assertEqual(outputs, EXPECTED_OUTPUT)
@slow
@require_bitsandbytes
def test_llava_index_error_bug(self):