VLM: fixes after refactor (#32907)

* leave only half of the changes
* fix tests
* [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava
* fix tests, first try
* [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava
* fix, second try
* [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava
* fix
* [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava
2024-09-10 12:02:37 +02:00
parent f24f084329
commit 7d2d6ce9cb
15 changed files with 577 additions and 500 deletions
--- a/tests/models/video_llava/test_modeling_video_llava.py
+++ b/tests/models/video_llava/test_modeling_video_llava.py
@@ -383,18 +383,19 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
        # Let' s make sure we test the preprocessing to replace what is used
        model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)

-        prompt = "USER: <video>Why is this video funny? ASSISTANT:"
+        prompt = "USER: <video>\nWhy is this video funny? ASSISTANT:"
        video_file = hf_hub_download(
            repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
        )
        video_file = np.load(video_file)
        inputs = self.processor(prompt, videos=video_file, return_tensors="pt")

-        EXPECTED_INPUT_IDS = torch.tensor([[1,  3148, 1001, 29901, 29871, 32001, 3750, 338, 445, 4863, 2090, 1460, 29973, 319, 1799, 9047, 13566, 29901]])  # fmt: skip
+        EXPECTED_INPUT_IDS = torch.tensor([[1,  3148, 1001, 29901, 29871, 32001, 13, 11008, 338, 445, 4863, 2090, 1460, 29973, 319, 1799, 9047, 13566, 29901]])  # fmt: skip
+
        self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))

        output = model.generate(**inputs, do_sample=False, max_new_tokens=20)
-        EXPECTED_DECODED_TEXT = "USER:  Why is this video funny? ASSISTANT: The video is funny because the baby is playing with a Wii remote while sitting on a bed"  # fmt: skip
+        EXPECTED_DECODED_TEXT = "USER: \nWhy is this video funny? ASSISTANT: The video is funny because it shows a baby sitting on a bed and reading a book, which"  # fmt: skip

        self.assertEqual(
            self.processor.decode(output[0], skip_special_tokens=True),
@@ -404,12 +405,11 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
    @slow
    @require_bitsandbytes
    def test_small_model_integration_test_mixed_inputs(self):
-        # Let' s make sure we test the preprocessing to replace what is used
        model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)

        prompts = [
-            "USER: <image>What are the cats in the image doing? ASSISTANT:",
-            "USER: <video>Why is this video funny? ASSISTANT:",
+            "USER: <image>\nWhat are the cats in the image doing? ASSISTANT:",
+            "USER: <video>\nWhy is this video funny? ASSISTANT:",
        ]
        video_file = hf_hub_download(
            repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
@@ -422,8 +422,8 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
        output = model.generate(**inputs, do_sample=False, max_new_tokens=20)

        EXPECTED_DECODED_TEXT = [
-            'USER:  What are the cats in the image doing? ASSISTANT: The cats in the image are lying down on a red couch, possibly sleeping or rest',
-            'USER:  Why is this video funny? ASSISTANT: The video is funny because the baby is playing with a Wii remote while sitting on a bed'
+            'USER: \nWhat are the cats in the image doing? ASSISTANT: The cats in the image are sleeping or resting on a couch.',
+            'USER: \nWhy is this video funny? ASSISTANT: The video is funny because it shows a baby sitting on a bed and reading a book. The'
        ]  # fmt: skip

        self.assertEqual(
@@ -434,12 +434,10 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
    @slow
    @require_bitsandbytes
    def test_small_model_integration_test_llama(self):
-        # Let' s make sure we test the preprocessing to replace what is used
-
        model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)
        processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")

-        prompt = "USER: <video>Describe the video in details. ASSISTANT:"
+        prompt = "USER: <video>\nDescribe the video in details. ASSISTANT:"
        video_file = hf_hub_download(
            repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
        )
@@ -447,11 +445,11 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
        inputs = self.processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)

        output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
-        EXPECTED_DECODED_TEXT = "USER:  Describe the video in details. ASSISTANT: The video features a young child sitting on a bed, holding a book and reading it. " \
-            "The child appears to be enjoying the book, as they are fully engaged in the reading process. The bed is located in a bedroom, and there is a chair nearby. " \
-            "The child is wearing a light blue shirt and pink pants, and they have glasses on. The room is well-lit, and there is a clock on the wall. The child seems " \
-            "to be in a comfortable and relaxed environment, which is conducive to reading and learning. Overall, the video captures a heartwarming moment of a child " \
-            "engaging in a simple yet essential activity, which is reading."  # fmt: skip
+        EXPECTED_DECODED_TEXT = "USER: \nDescribe the video in details. ASSISTANT: The video features a young child sitting on a bed, holding a book and reading it. " \
+            "The child appears to be enjoying the book, as they are fully engaged in the activity. The bed is located in a bedroom, and there is a chair nearby. The " \
+            "child is wearing a blue shirt and glasses, which suggests that they might have a visual impairment. The room is well-lit, and there is a clock on the wall, " \
+            "indicating the time. The child's focus on the book indicates that they are interested in the content and are actively participating in the reading process. " \
+            "Overall, the video captures a heartwarming moment of a child engaging in a simple yet essential activity, which is reading."  # fmt: skip

        self.assertEqual(
            processor.decode(output[0], skip_special_tokens=True),
@@ -461,15 +459,13 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
    @slow
    @require_bitsandbytes
    def test_small_model_integration_test_llama_batched(self):
-        # Let' s make sure we test the preprocessing to replace what is used
-
        model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)
        processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
        processor.tokenizer.padding_side = "left"

        prompts = [
-            "USER: <video>What is the baby doing? ASSISTANT:",
-            "USER: <video>Who is sitting next to the woman? ASSISTANT:",
+            "USER: <video>\nWhat is the baby doing? ASSISTANT:",
+            "USER: <video>\nWho is sitting next to the woman? ASSISTANT:",
        ]
        video_1 = np.load(
            hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset")
@@ -483,48 +479,12 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
        output = model.generate(**inputs, max_new_tokens=20)

        EXPECTED_DECODED_TEXT = [
-            'USER:  What is the baby doing? ASSISTANT: The baby is sitting on a bed and reading a book.Ъ',
-            'USER:  Who is sitting next to the woman? ASSISTANT: A small dog is sitting next to the woman.Ъ'
+            'USER: \nWhat is the baby doing? ASSISTANT: The baby is sitting on a bed and reading a book.',
+            'USER: \nWho is sitting next to the woman? ASSISTANT: A small dog is sitting next to the woman.'
        ]  # fmt: skip

        self.assertEqual(processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)

-    @slow
-    @require_bitsandbytes
-    def test_small_model_integration_test_llama_batched_regression(self):
-        # Let' s make sure we test the preprocessing to replace what is used
-
-        # Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before)
-        model = VideoLlavaForConditionalGeneration.from_pretrained(
-            "LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True, attn_implementation="eager"
-        )
-        processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", pad_token="<pad>")
-        processor.tokenizer.padding_side = "left"
-
-        prompts = [
-            "USER: <video>What is the baby doing? ASSISTANT:",
-            "USER: <video>Who is sitting next to the woman? ASSISTANT: A small dog is sitting next to the woman. USER: <video>What about this video? ASSITANT:",
-        ]
-        video_1 = np.load(
-            hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset")
-        )
-        video_2 = np.load(
-            hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="video_demo_2.npy", repo_type="dataset")
-        )
-
-        inputs = processor(prompts, videos=[video_1, video_2, video_1], return_tensors="pt", padding=True)
-
-        output = model.generate(**inputs, max_new_tokens=20)
-
-        # fmt: off
-        EXPECTED_DECODED_TEXT = [
-            'USER:  What is the baby doing? ASSISTANT: The baby is sitting on a bed and reading a book.Ъ',
-            'USER:  Who is sitting next to the woman? ASSISTANT: A small dog is sitting next to the woman. USER:  What about this video? ASSITANT: The video shows a baby sitting on a bed, reading a book. The baby is wearing glass'
-        ]
-        # fmt: on
-
-        self.assertEqual(processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)
-
    @slow
    @require_bitsandbytes
    def test_video_llava_index_error_bug(self):
@@ -552,32 +512,23 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
    @require_torch_gpu
    def test_video_llava_merge_inputs_error_bug(self):
        # This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
-        model = VideoLlavaForConditionalGeneration.from_pretrained(
-            "LanguageBind/Video-LLaVA-7B-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True
-        ).to(torch_device)
+        model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)

        # Simulate some user inputs
        pixel_values_videos = torch.randn(
-            (2, 8, 3, 224, 224),
+            (1, 8, 3, 224, 224),
            dtype=torch.float,
            device=torch_device,
        )
        # fmt: off
        input_ids = torch.tensor(
-            [
-                [
-                    32001, 32001, 1, 15043, 7084, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 29871, 13, 7900
-                ],
-                [
-                    1, 15043, 7084, 29901, 29871, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 29871, 13, 7900
-                ],
-            ],
+            [[32002, 32002, 1, 15043, 7084, 32001, 29871, 13, 7900]],
            dtype=torch.long,
            device=torch_device,
        )
        # fmt: on
        attention_mask = torch.tensor(
-            [[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
+            [[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
            dtype=torch.long,
            device=torch_device,
        )
@@ -591,6 +542,36 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
        ).loss
        loss.backward()

+    @slow
+    @require_bitsandbytes
+    def test_expansion_in_processing_images(self):
+        model_id = "LanguageBind/Video-LLaVA-7B-hf"
+        model = VideoLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
+        processor = VideoLlavaProcessor.from_pretrained(model_id)
+
+        prompt = "USER: <image>\nDescribe the image in details. ASSISTANT:"
+        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        image = Image.open(requests.get(url, stream=True).raw)
+
+        # check processing with expansion of inputs
+        processor.vision_feature_select_strategy = "default"
+        processor.patch_size = 14
+        inputs_expanded = processor(prompt, images=image, return_tensors="pt").to(torch_device, torch.float16)
+        self.assertTrue(inputs_expanded.input_ids.shape[-1] == 274)
+
+        # check processing without expansion of inputs (legacy behavior)
+        processor.vision_feature_select_strategy = None
+        processor.patch_size = None
+        inputs = processor(prompt, images=image, return_tensors="pt").to(torch_device, torch.float16)
+        self.assertTrue(inputs.input_ids.shape[-1] == 19)
+
+        # generate exactly 20 tokens
+        output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
+        output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
+
+        # check that both inputs are handled correctly and generate the same output
+        self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
+
    @slow
    @require_bitsandbytes
    def test_expansion_in_processing(self):
@@ -598,7 +579,7 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
        model = VideoLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
        processor = VideoLlavaProcessor.from_pretrained(model_id)

-        prompt = "USER: <video>Describe the video in details. ASSISTANT:"
+        prompt = "USER: <video>\nDescribe the video in details. ASSISTANT:"
        video_file = hf_hub_download(
            repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
        )
@@ -608,13 +589,13 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
        processor.vision_feature_select_strategy = "default"
        processor.patch_size = 14
        inputs_expanded = processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
-        self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2073)
+        self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2074)

        # check processing without expansion of inputs (legacy behavior)
        processor.vision_feature_select_strategy = None
        processor.patch_size = None
        inputs = processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
-        self.assertTrue(inputs.input_ids.shape[-1] == 18)
+        self.assertTrue(inputs.input_ids.shape[-1] == 19)

        # generate exactly 20 tokens
        output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)