Update some tests for torch 2.7.1 (#38701)

* fix 1 * fix 2 * fix 3 * fix 4 * fp16 * break * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-06-10 11:46:52 +02:00
parent afdb821318
commit 04cdf83244
5 changed files with 88 additions and 70 deletions
--- a/tests/models/internvl/test_modeling_internvl.py
+++ b/tests/models/internvl/test_modeling_internvl.py
@@ -292,7 +292,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_generate(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
@@ -300,19 +300,20 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        prompt = (
            "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
        )
-        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
            decoded_output = processor.decode(
                generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
            )
-        expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
+        expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch."
+
        self.assertEqual(decoded_output, expected_output)

    def test_qwen2_small_model_integration_forward(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
@@ -320,7 +321,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        prompt = (
            "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
        )
-        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)

        # Forward
        with torch.inference_mode():
@@ -329,9 +330,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        actual_logits = output.logits[0, -1, :5].cpu()
        expected_logits_all = Expectations(
            {
-                ("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.bfloat16),
-                ("cuda", 7): torch.tensor([11.9375, 14.7500, 14.4375, 10.8125,  7.0938], dtype=torch.bfloat16),
-                ("cuda", 8): torch.tensor([11.8750, 14.8125, 14.3125, 10.8125,  6.9375], dtype=torch.bfloat16),
+                ("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.float16),
+                ("cuda", 7): torch.tensor([11.9531, 14.7031, 14.2734, 10.6562,  6.9219], dtype=torch.float16),
+                ("cuda", 8): torch.tensor([11.9609, 14.7188, 14.2734, 10.6484,  6.9141], dtype=torch.float16),
            }
        )  # fmt: skip
        expected_logits = expected_logits_all.get_expectation()
@@ -347,10 +348,10 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_generate_text_only(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
-        inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
            decoded_output = processor.decode(
@@ -360,8 +361,8 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
-                ("cuda", 7): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
-                ("cuda", 8): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light begins.",
+                ("cuda", 7): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.',
+                ("cuda", 8): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.',
            }
        )  # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -371,7 +372,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_generate_chat_template(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        messages = [
            {
@@ -385,20 +386,21 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):

        inputs = processor.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
-        ).to(torch_device, dtype=torch.bfloat16)
+        ).to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
            decoded_output = processor.decode(
                generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
            )
-        expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
+        expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch."
+
        self.assertEqual(decoded_output, expected_output)

    @require_deterministic_for_xpu
    def test_qwen2_small_model_integration_batched_generate(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        # Prepare inputs
        prompt = [
@@ -409,7 +411,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

        inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
-            torch_device, dtype=torch.bfloat16
+            torch_device, dtype=torch.float16
        )

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -417,6 +419,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        # Check first output
        decoded_output = processor.decode(output[0], skip_special_tokens=True)
        expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake,  \nWooden pier,  \nNature's peace."  # fmt: skip
+
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -428,7 +431,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate"',
-                ("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Arch,"',
+                ("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate of',
            }
        )  # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -442,7 +445,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_batched_generate_multi_image(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        # Prepare inputs
        prompt = [
@@ -466,7 +469,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        )

        inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
-            torch_device, dtype=torch.bfloat16
+            torch_device, dtype=torch.float16
        )

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -548,7 +551,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_interleaved_images_videos(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, torch_dtype=torch.bfloat16, device_map=torch_device
+            self.small_model_checkpoint, torch_dtype=torch.float16, device_map=torch_device
        )
        messages = [
            [
@@ -600,7 +603,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
            return_tensors="pt",
            padding=True,
            num_frames=8,
-        ).to(torch_device, dtype=torch.bfloat16)
+        ).to(torch_device, dtype=torch.float16)

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)

@@ -609,10 +612,11 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n   - The Statue of Liberty is prominently featured on an",
-                ("cuda", 7): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n   - The Statue of Liberty is prominently featured on an",
+                ("cuda", 7): 'user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n   - The Statue of Liberty is prominently featured on an',
            }
        )  # fmt: skip
        expected_output = expected_outputs.get_expectation()
+
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -623,7 +627,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
-                ("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
+                ("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot',
            }
        )  # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -635,7 +639,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):

        # Check third output
        decoded_output = processor.decode(output[2], skip_special_tokens=True)
-        expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake,  \nWooden pier,  \nNature's peace."  # fmt: skip
+        expected_output = (
+            "user\n\nWrite a haiku for this image\nassistant\nSilky lake,  \nWooden pier,  \nNature's peace."
+        )
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -657,7 +663,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_generate(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
@@ -665,7 +671,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        prompt = (
            "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
        )
-        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
            decoded_output = processor.decode(
@@ -677,7 +683,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_forward(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
@@ -685,7 +691,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        prompt = (
            "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
        )
-        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)

        # Forward
        with torch.inference_mode():
@@ -695,12 +701,12 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):

        expected_logits_all = Expectations(
            {
-                ("xpu", 3): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.bfloat16),
-                ("cuda", 7): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.bfloat16),
-                ("cuda", 8): torch.tensor([-9.8750,  -0.5117,   1.4297, -10.3750, -10.3750], dtype=torch.bfloat16),
+                ("xpu", 3): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.float16),
+                ("cuda", 7): torch.tensor([-9.8750,  -0.4861,   1.4648, -10.3359, -10.3359], dtype=torch.float16),
+                ("cuda", 8): torch.tensor([-9.8906,  -0.4995,   1.4473, -10.3359, -10.3438], dtype=torch.float16),
            }
        )  # fmt: skip
-        expected_logits = torch.tensor(expected_logits_all.get_expectation(), dtype=torch.bfloat16)
+        expected_logits = torch.tensor(expected_logits_all.get_expectation(), dtype=torch.float16)

        # The original implementation and the transformers implementation do not match exactly, hence the higher tolerance.
        # The difference is likely due to the different implementations of the attention mechanism (different order of operations)
@@ -716,22 +722,30 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_generate_text_only(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
-        inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
            decoded_output = processor.decode(
                generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
            )
-        expected_output = "Autumn leaves fall,\nNature's breath, a season's sigh,\nSilent woods awake."
+
+        expected_outputs = Expectations(
+            {
+                ("cuda", 7): "Autumn leaves fall,\nNature's breath, a gentle sigh,\nSilent whispers.",
+                ("cuda", 8): "Autumn leaves fall,\nNature's breath, a silent sigh,\nWinter's chill approaches.",
+            }
+        )
+        expected_output = expected_outputs.get_expectation()
+
        self.assertEqual(decoded_output, expected_output)

    def test_llama_small_model_integration_generate_chat_template(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        messages = [
            {
@@ -745,7 +759,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):

        inputs = processor.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
-        ).to(torch_device, dtype=torch.bfloat16)
+        ).to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
            decoded_output = processor.decode(
@@ -757,7 +771,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_batched_generate(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        # Prepare inputs
        prompt = [
@@ -768,7 +782,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

        inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
-            torch_device, dtype=torch.bfloat16
+            torch_device, dtype=torch.float16
        )

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -778,11 +792,12 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
-                ("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
-                ("cuda", 8): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nA wooden path leads to the sea,\nPeaceful, still waters.",
+                ("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
+                ("cuda", 8): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
            }
        )  # fmt: skip
        expected_output = expected_outputs.get_expectation()
+
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -791,7 +806,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):

        # Check second output
        decoded_output = processor.decode(output[1], skip_special_tokens=True)
-        expected_output = 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese gate in the background, adorned with red and gold colors and Chinese characters'  # fmt: skip
+        expected_output = "user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese gate in the background, adorned with red and gold colors and Chinese characters"
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -801,7 +816,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_batched_generate_multi_image(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        # Prepare inputs
        prompt = [
@@ -825,7 +840,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        )

        inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
-            torch_device, dtype=torch.bfloat16
+            torch_device, dtype=torch.float16
        )

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -833,7 +848,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        # Check first output
        decoded_output = processor.decode(output[0], skip_special_tokens=True)
        # Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
-        expected_output = "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace."  # fmt: skip
+        expected_output = "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors."
+
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -842,7 +858,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):

        # Check second output
        decoded_output = processor.decode(output[1], skip_special_tokens=True)
-        expected_output = 'user\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences'  # fmt: skip
+        expected_output = "user\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences"
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -893,7 +909,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_interleaved_images_videos(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, torch_dtype=torch.bfloat16, device_map=torch_device
+            self.small_model_checkpoint, torch_dtype=torch.float16, device_map=torch_device
        )
        messages = [
            [
@@ -945,7 +961,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
            return_tensors="pt",
            padding=True,
            num_frames=8,
-        ).to(torch_device, dtype=torch.bfloat16)
+        ).to(torch_device, dtype=torch.float16)

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)

@@ -954,8 +970,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
-                ("cuda", 7): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
-                ("cuda", 8): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences",
+                ("cuda", 7): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **',
+                ("cuda", 8): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no',
            }
        )  # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -970,8 +986,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
-                ("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
-                ("cuda", 8): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
+                ("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
+                ("cuda", 8): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
            }
        )  # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -986,8 +1002,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
-                ("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
-                ("cuda", 8): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
+                ("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
+                ("cuda", 8): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
            }
        )  # fmt: skip
        expected_output = expected_outputs.get_expectation()