[chat-template] Unify tests and clean up 🧼 (#37275)

* fix tests and some clean up
* make one general test for each modality
* remove redundant merging of kwargs
* edge cases
* don't enforce slow when reloading
* fix gemma3 tests
* has to adapt llama 4 after rebase
* remove also from overridden tests
* should be green now
2025-04-10 14:42:32 +02:00
parent 10144ff116
commit 1ae8d54b04
18 changed files with 389 additions and 1112 deletions
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -22,6 +22,7 @@ from typing import Optional

 import numpy as np
 from huggingface_hub import hf_hub_download
+from parameterized import parameterized

 from transformers.models.auto.processing_auto import processor_class_from_name
 from transformers.processing_utils import Unpack
@@ -44,6 +45,22 @@ if is_torch_available():
    import torch


+MODALITY_INPUT_DATA = {  # two sample URLs per modality; the chat-template tests slice each list by batch size
+    "images": [
+        "http://images.cocodataset.org/val2017/000000039769.jpg",
+        "http://images.cocodataset.org/val2017/000000039769.jpg",
+    ],
+    "videos": [  # second entry is a list of image URLs, presumably consumed as video frames (TODO confirm)
+        "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4",
+        ["https://www.ilankelman.org/stopsigns/australia.jpg", "https://www.ilankelman.org/stopsigns/australia.jpg"],
+    ],
+    "audio": [
+        "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
+        "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav",
+    ],
+}
+
+
 def prepare_image_inputs():
    """This function prepares a list of PIL images"""
    image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)]
@@ -729,7 +746,7 @@ class ProcessorTesterMixin:
            )

    def test_chat_template_save_loading(self):
-        processor = self.get_processor()
+        processor = self.processor_class.from_pretrained(self.tmpdirname)
        signature = inspect.signature(processor.__init__)
        if "chat_template" not in {*signature.parameters.keys()}:
            self.skipTest("Processor doesn't accept chat templates at input")
@@ -756,210 +773,133 @@ class ProcessorTesterMixin:
            # the reloaded tokenizer should get the chat template as well
            self.assertEqual(reloaded_processor.chat_template, reloaded_processor.tokenizer.chat_template)

-    def test_image_chat_template_single(self):
+    @require_torch
+    def _test_apply_chat_template(
+        self,
+        modality: str,
+        batch_size: int,
+        return_tensors: str,
+        input_name: str,
+        processor_name: str,
+        input_data: list[str],
+    ):
        processor = self.get_processor()
        if processor.chat_template is None:
            self.skipTest("Processor has no chat template")

-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        if processor_name not in self.processor_class.attributes:
+            self.skipTest(f"{processor_name} attribute not present in {self.processor_class}")

-        messages = [
+        # some models have only Fast image processor
+        if getattr(processor, processor_name).__class__.__name__.endswith("Fast"):
+            return_tensors = "pt"
+
+        batch_messages = [
             [
                 {
                     "role": "user",
-                    "content": [
-                        {"type": "text", "text": "What is shown in this image?"},
-                    ],
+                    "content": [{"type": "text", "text": "Describe this."}],
                 },
             ]
-        ]
+        ] * batch_size  # NOTE(review): repeats ONE conversation object; in-place edits to it show in every entry

-        formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-        self.assertEqual(len(formatted_prompt), 1)
+        # Test that jinja can be applied
+        formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False)
+        self.assertEqual(len(formatted_prompt), batch_size)

+        # Test that tokenizing with template and directly with `self.tokenizer` gives same output
        formatted_prompt_tokenized = processor.apply_chat_template(
-            messages, add_generation_prompt=True, tokenize=True, return_tensors=None
+            batch_messages, add_generation_prompt=True, tokenize=True, return_tensors=return_tensors
        )
        add_special_tokens = True
        if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
            add_special_tokens = False
-        expected_output = processor.tokenizer(
-            formatted_prompt, return_tensors=None, add_special_tokens=add_special_tokens
-        ).input_ids
-        self.assertListEqual(expected_output, formatted_prompt_tokenized)
-
-        out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True)
-        self.assertTrue(all(key in out_dict for key in ["input_ids", "attention_mask"]))
-
-        # Now test the ability to return dict
-        messages[0][0]["content"].append(
-            {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
+        tok_output = processor.tokenizer(
+            formatted_prompt, return_tensors=return_tensors, add_special_tokens=add_special_tokens
        )
-        out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True)
-        self.assertTrue(self.images_input_name in out_dict)
+        expected_output = tok_output.input_ids
+        self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist())

-        # should always have input_ids and attention_mask
-        self.assertEqual(len(out_dict["input_ids"]), 1)
-        self.assertEqual(len(out_dict["attention_mask"]), 1)
-        self.assertEqual(len(out_dict[self.images_input_name]), 1)
-
-    def test_image_chat_template_batched(self):
-        processor = self.get_processor()
-        if processor.chat_template is None:
-            self.skipTest("Processor has no chat template")
-
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-
-        batched_messages = [
-            [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": "What is shown in this image?"},
-                    ],
-                },
-            ],
-            [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": "What do you see?"},
-                    ],
-                },
-            ],
-        ]
-
-        formatted_prompt = processor.apply_chat_template(batched_messages, add_generation_prompt=True, tokenize=False)
-        self.assertEqual(len(formatted_prompt), 2)
-
-        formatted_prompt_tokenized = processor.apply_chat_template(
-            batched_messages, add_generation_prompt=True, tokenize=True, padding=True, return_tensors=None
-        )
-        add_special_tokens = True
-        if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
-            add_special_tokens = False
-        expected_output = processor.tokenizer(
-            formatted_prompt,
-            return_tensors=None,
-            padding=True,
-            add_special_tokens=add_special_tokens,
-        ).input_ids
-        self.assertListEqual(expected_output, formatted_prompt_tokenized)
-
-        out_dict = processor.apply_chat_template(
-            batched_messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            padding=True,
-        )
-        self.assertTrue(all(key in out_dict for key in ["input_ids", "attention_mask"]))
-
-        # Now test the ability to return dict
-        batched_messages[0][0]["content"].append(
-            {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
-        )
-        batched_messages[1][0]["content"].append(
-            {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"}
-        )
-        out_dict = processor.apply_chat_template(
-            batched_messages, add_generation_prompt=True, tokenize=True, return_dict=True, padding=True
-        )
-        self.assertTrue(self.images_input_name in out_dict)
-
-        # should always have input_ids and attention_mask
-        self.assertEqual(len(out_dict["input_ids"]), 2)
-        self.assertEqual(len(out_dict["attention_mask"]), 2)
-        self.assertEqual(len(out_dict[self.images_input_name]), 2)
-
-    def test_image_chat_template_accepts_processing_kwargs(self):
-        processor = self.get_processor()
-        if processor.chat_template is None:
-            self.skipTest("Processor has no chat template")
-
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-
-        messages = [
-            [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": "What is shown in this image?"},
-                    ],
-                },
-            ]
-        ]
-
-        formatted_prompt_tokenized = processor.apply_chat_template(
-            messages,
+        # Test that kwargs passed to processor's `__call__` are actually used
+        tokenized_prompt_100 = processor.apply_chat_template(
+            batch_messages,
            add_generation_prompt=True,
            tokenize=True,
            padding="max_length",
            truncation=True,
-            max_length=50,
+            return_tensors=return_tensors,
+            max_length=100,
        )
-        self.assertEqual(len(formatted_prompt_tokenized[0]), 50)
+        self.assertEqual(len(tokenized_prompt_100[0]), 100)

-        formatted_prompt_tokenized = processor.apply_chat_template(
-            messages,
+        # Test that `return_dict=True` returns text related inputs in the dict
+        out_dict_text = processor.apply_chat_template(
+            batch_messages,
            add_generation_prompt=True,
            tokenize=True,
-            truncation=True,
-            max_length=5,
+            return_dict=True,
+            return_tensors=return_tensors,
        )
-        self.assertEqual(len(formatted_prompt_tokenized[0]), 5)
+        self.assertTrue(all(key in out_dict_text for key in ["input_ids", "attention_mask"]))
+        self.assertEqual(len(out_dict_text["input_ids"]), batch_size)
+        self.assertEqual(len(out_dict_text["attention_mask"]), batch_size)
+
+        # Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict
+        for idx, url in enumerate(input_data[:batch_size]):
+            batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": modality, "url": url}]

-        # Now test the ability to return dict
-        messages[0][0]["content"].append(
-            {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
-        )
        out_dict = processor.apply_chat_template(
-            messages,
+            batch_messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
-            do_rescale=True,
-            rescale_factor=-1,
-            return_tensors="np",
+            return_tensors=return_tensors,
+            num_frames=4,  # by default no more than 4 frames, otherwise too slow
        )
-        self.assertLessEqual(out_dict[self.images_input_name][0][0].mean(), 0)
+        input_name = getattr(self, input_name)
+        self.assertTrue(input_name in out_dict)
+        self.assertEqual(len(out_dict["input_ids"]), batch_size)
+        self.assertEqual(len(out_dict["attention_mask"]), batch_size)
+        self.assertEqual(len(out_dict[input_name]), batch_size)

-    @require_torch
-    def test_image_chat_template_dict_torch(self):
-        processor = self.get_processor()
-        if processor.chat_template is None:
-            self.skipTest("Processor has no chat template")
+        return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
+        for k in out_dict:
+            self.assertIsInstance(out_dict[k], return_tensor_to_type[return_tensors])

-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
-                    {"type": "text", "text": "What is shown in this image?"},
-                ],
-            },
-        ]
-
-        out_dict_tensors = processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-        )
-        self.assertTrue(self.images_input_name in out_dict_tensors)
-        for k in out_dict_tensors:
-            self.assertIsInstance(out_dict_tensors[k], torch.Tensor)
+        # Test continue from final message
+        assistant_message = {
+            "role": "assistant",
+            "content": [{"type": "text", "text": "It is the sound of"}],
+        }
+        for idx, conversation in enumerate(batch_messages):  # every conversation, independent of len(input_data)
+            batch_messages[idx] = conversation + [assistant_message]
+        continue_prompt = processor.apply_chat_template(batch_messages, continue_final_message=True, tokenize=False)
+        for prompt in continue_prompt:
+            self.assertTrue(prompt.endswith("It is the sound of"))  # no `eos` token at the end

    @require_av
-    def test_chat_template_video(self):
+    @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
+    def test_apply_chat_template_audio(self, batch_size: int, return_tensors: str):
+        self._test_apply_chat_template(
+            "audio", batch_size, return_tensors, "audio_input_name", "feature_extracttor", MODALITY_INPUT_DATA["audio"]
+        )
+
+    @require_librosa
+    @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
+    def test_apply_chat_template_video(self, batch_size: int, return_tensors: str):
+        self._test_apply_chat_template(
+            "video", batch_size, return_tensors, "videos_input_name", "video_processor", MODALITY_INPUT_DATA["videos"]
+        )
+
+    @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
+    def test_apply_chat_template_image(self, batch_size: int, return_tensors: str):
+        self._test_apply_chat_template(
+            "image", batch_size, return_tensors, "images_input_name", "image_processor", MODALITY_INPUT_DATA["images"]
+        )
+
+    def test_apply_chat_template_video_frame_sampling(self):
        processor = self.get_processor()
+
        if processor.chat_template is None:
            self.skipTest("Processor has no chat template")

@@ -975,37 +915,16 @@ class ProcessorTesterMixin:
                {
                    "role": "user",
                    "content": [
-                        {"type": "video"},
+                        {
+                            "type": "video",
+                            "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4",
+                        },
                        {"type": "text", "text": "What is shown in this video?"},
                    ],
                },
            ]
        ]

-        formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-        self.assertEqual(len(formatted_prompt), 1)
-
-        formatted_prompt_tokenized = processor.apply_chat_template(
-            messages, add_generation_prompt=True, tokenize=True, return_tensors=None
-        )
-        add_special_tokens = True
-        if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
-            add_special_tokens = False
-        expected_output = processor.tokenizer(
-            formatted_prompt,
-            return_tensors=None,
-            add_special_tokens=add_special_tokens,
-        ).input_ids
-        self.assertListEqual(expected_output, formatted_prompt_tokenized)
-
-        out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True)
-        self.assertTrue(all(key in out_dict for key in ["input_ids", "attention_mask"]))
-
-        # Add video URL for return dict and load with `num_frames` arg
-        messages[0][0]["content"][0] = {
-            "type": "video",
-            "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4",
-        }
        num_frames = 3
        out_dict_with_video = processor.apply_chat_template(
            messages,
@@ -1013,6 +932,7 @@ class ProcessorTesterMixin:
            tokenize=True,
            return_dict=True,
            num_frames=num_frames,
+            return_tensors="np",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
@@ -1026,6 +946,7 @@ class ProcessorTesterMixin:
            tokenize=True,
            return_dict=True,
            video_fps=video_fps,
+            return_tensors="np",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
@@ -1073,53 +994,7 @@ class ProcessorTesterMixin:
        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 2)

    @require_av
-    def test_chat_template_video_custom_sampling(self):
-        """
-        Tests that models can pass their custom callables to sample video indices.
-        """
-        processor = self.get_processor()
-        if processor.chat_template is None:
-            self.skipTest("Processor has no chat template")
-
-        signature = inspect.signature(processor.__call__)
-        if "videos" not in {*signature.parameters.keys()} or (
-            signature.parameters.get("videos") is not None
-            and signature.parameters["videos"].annotation == inspect._empty
-        ):
-            self.skipTest("Processor doesn't accept videos at input")
-
-        video_file_path = hf_hub_download(
-            repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
-        )
-        messages = [
-            [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "video", "path": video_file_path},
-                        {"type": "text", "text": "What is shown in this video?"},
-                    ],
-                },
-            ]
-        ]
-
-        def dummy_sample_indices_fn(metadata, **fn_kwargs):
-            # sample only the first two frame always
-            return [0, 1]
-
-        out_dict_with_video = processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            sample_indices_fn=dummy_sample_indices_fn,
-        )
-        self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 2)
-
-    @require_av
-    def test_chat_template_video_special_processing(self):
+    def test_apply_chat_template_video_special_processing(self):
        """
        Tests that models can use their own preprocessing to preprocess conversations.
        """
@@ -1176,6 +1051,7 @@ class ProcessorTesterMixin:
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
+            return_tensors="np",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)

@@ -1187,7 +1063,7 @@ class ProcessorTesterMixin:

    @require_librosa
    @require_av
-    def test_audio_chat_template_from_video(self):
+    def test_chat_template_audio_from_video(self):
        processor = self.get_processor()
        if processor.chat_template is None:
            self.skipTest("Processor has no chat template")
@@ -1241,124 +1117,10 @@ class ProcessorTesterMixin:
            load_audio_from_video=True,
        )
        self.assertTrue(self.audio_input_name in out_dict)
-        self.assertTrue(self.video_input_name in out_dict)
+        self.assertTrue(self.videos_input_name in out_dict)

        # should always have input_ids and attention_mask
        self.assertEqual(len(out_dict["input_ids"]), 1)  # batch-size=1
        self.assertEqual(len(out_dict["attention_mask"]), 1)  # batch-size=1
        self.assertEqual(len(out_dict[self.audio_input_name]), 2)  # 2 audios in the conversation
-        self.assertEqual(len(out_dict[self.video_input_name]), 1)  # 1 video in the conversation
-
-    @require_librosa
-    def test_audio_chat_template_single(self):
-        processor = self.get_processor()
-        if processor.chat_template is None:
-            self.skipTest("Processor has no chat template")
-
-        if "feature_extractor" not in self.processor_class.attributes:
-            self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
-
-        messages = [
-            {
-                "role": "system",
-                "content": [{"type": "text", "text": "You are a helpful assistant."}],
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "audio",
-                    },
-                    {"type": "text", "text": "What's that sound?"},
-                ],
-            },
-            {
-                "role": "assistant",
-                "content": [{"type": "text", "text": "It is the sound of glass shattering."}],
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "audio",
-                    },
-                    {"type": "text", "text": "How about this one?"},
-                ],
-            },
-        ]
-
-        formatted_prompt = processor.apply_chat_template([messages], add_generation_prompt=True, tokenize=False)
-        self.assertEqual(len(formatted_prompt), 1)  # batch size=1
-
-        formatted_prompt_tokenized = processor.apply_chat_template(
-            messages, add_generation_prompt=True, tokenize=True, return_tensors=None
-        )
-        expected_output = processor.tokenizer(formatted_prompt, return_tensors=None).input_ids
-        self.assertListEqual(expected_output, formatted_prompt_tokenized)
-
-        messages[1]["content"][0]["audio"] = (
-            "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"
-        )
-        messages[3]["content"][0]["audio"] = (
-            "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"
-        )
-        out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True)
-        self.assertTrue(self.audio_input_name in out_dict)
-
-        # should always have input_ids and attention_mask
-        self.assertEqual(len(out_dict["input_ids"]), 1)  # batch-size=1
-        self.assertEqual(len(out_dict["attention_mask"]), 1)  # batch-size=1
-        self.assertEqual(len(out_dict[self.audio_input_name]), 2)  # 2 audios in the conversation
-
-    @require_torch
-    @require_librosa
-    def test_audio_chat_template_dict_torch(self):
-        processor = self.get_processor()
-        if processor.chat_template is None:
-            self.skipTest("Processor has no chat template")
-
-        if "feature_extractor" not in self.processor_class.attributes:
-            self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
-
-        messages = [
-            {
-                "role": "system",
-                "content": [{"type": "text", "text": "You are a helpful assistant."}],
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "audio",
-                        "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
-                    },
-                    {"type": "text", "text": "What's that sound?"},
-                ],
-            },
-            {
-                "role": "assistant",
-                "content": [{"type": "text", "text": "It is the sound of glass shattering."}],
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "audio",
-                        "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav",
-                    },
-                    {"type": "text", "text": "How about this one?"},
-                ],
-            },
-        ]
-
-        out_dict_tensors = processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-        )
-
-        self.assertTrue(self.audio_input_name in out_dict_tensors)
-        for k in out_dict_tensors:
-            self.assertIsInstance(out_dict_tensors[k], torch.Tensor)
+        self.assertEqual(len(out_dict[self.videos_input_name]), 1)  # 1 video in the conversation