VLM: fixes after refactor (#32907)

* leave only half of the changes

* fix tests

* [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava

* fix tests, first try

* [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava

* fix, second try

* [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava

* fix

* [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava
This commit is contained in:
Raushan Turganbay
2024-09-10 12:02:37 +02:00
committed by GitHub
parent f24f084329
commit 7d2d6ce9cb
15 changed files with 577 additions and 500 deletions

View File

@@ -271,26 +271,23 @@ class VipLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
def test_vipllava_merge_inputs_error_bug(self):
# This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
model_id = "llava-hf/vip-llava-7b-hf"
model = VipLlavaForConditionalGeneration.from_pretrained(
model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True
).to(torch_device)
model = VipLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
# Simulate some user inputs
pixel_values = torch.randn(
(2, 3, 336, 336),
(1, 3, 336, 336),
dtype=torch.float,
device=torch_device,
)
input_ids = torch.tensor(
[
[32001, 32001, 1, 15043, 7084, 32000, 29871, 13, 7900],
[1, 15043, 7084, 29901, 29871, 32000, 29871, 13, 7900],
],
dtype=torch.long,
device=torch_device,
)
attention_mask = torch.tensor(
[[0, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]],
[[0, 0, 1, 1, 1, 1, 1, 1, 1]],
dtype=torch.long,
device=torch_device,
)