From 1ae8d54b0456867c83e28ecb1fde2e1dcd480e60 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Thu, 10 Apr 2025 14:42:32 +0200 Subject: [PATCH] =?UTF-8?q?[chat-template]=20Unify=20tests=20and=20clean?= =?UTF-8?q?=20up=20=F0=9F=A7=BC=20=20(#37275)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix tests and some clean up * make one general test for each modality * remove redundant merging of kwargs * edge cases * dont enforce slow when reloading * fix gemma3 tests * has to adapt llama 4 after rebase * remove also from overriden tests * should be green now --- docs/source/en/chat_templating_multimodal.md | 29 -- .../models/smolvlm/processing_smolvlm.py | 53 +- src/transformers/processing_utils.py | 94 +++- tests/models/aria/test_processor_aria.py | 49 -- .../aya_vision/test_processor_aya_vision.py | 5 - tests/models/llava/test_processor_llava.py | 64 --- .../llava_next/test_processor_llava_next.py | 17 - .../test_processor_llava_next_video.py | 80 +-- .../test_processor_llava_onevision.py | 49 +- .../mistral3/test_processor_mistral3.py | 71 --- .../models/pixtral/test_processor_pixtral.py | 16 - .../qwen2_5_vl/test_processor_qwen2_5_vl.py | 202 +++----- .../qwen2_audio/test_processor_qwen2_audio.py | 28 +- .../qwen2_vl/test_processor_qwen2_vl.py | 202 +++----- .../test_processing_shieldgemma2.py | 19 +- .../models/smolvlm/test_processor_smolvlm.py | 10 +- .../vipllava/test_processor_vipllava.py | 41 -- tests/test_processing_common.py | 472 +++++------------- 18 files changed, 389 insertions(+), 1112 deletions(-) delete mode 100644 tests/models/vipllava/test_processor_vipllava.py diff --git a/docs/source/en/chat_templating_multimodal.md b/docs/source/en/chat_templating_multimodal.md index 6319f00b97..3a01f652aa 100644 --- a/docs/source/en/chat_templating_multimodal.md +++ b/docs/source/en/chat_templating_multimodal.md @@ -181,35 +181,6 @@ processed_chat = processor.apply_chat_template( print(processed_chat.keys()) 
``` - - - -Some models don't sample frames *uniformly* and require more complex logic to determine which frames to use. For example, the model may have an *adaptive frame selection* or if the model prioritizes *key moments* in a video rather than evenly spaced frames. - -If a model has a different sampling strategy, you can write a function that customizes frame selection. The function should include the following requirements. - -- Use the `sample_indices_fn` parameter to pass a callable function for sampling. -- If provided, this function *overrides* the standard `num_frames` and `fps` parameters. -- The function receives all the parameters passed to `load_video` and must return valid frame indices to sample from. - -An example function is shown below. This gives you full control over frame selection, making the model more adaptable to different video scenarios. - -```py -def sample_indices_fn(metadata, **kwargs): - # samples only the first and the second frame - return [0, 1] - -processed_chat = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - sample_indices_fn=sample_indices_fn, - video_load_backend="decord", -) -print(processed_chat.keys()) -``` - diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py index ddc72894af..4cf207c459 100644 --- a/src/transformers/models/smolvlm/processing_smolvlm.py +++ b/src/transformers/models/smolvlm/processing_smolvlm.py @@ -20,10 +20,13 @@ import copy from datetime import timedelta from typing import TYPE_CHECKING, Dict, List, Optional, Union +import numpy as np + from ...feature_extraction_utils import BatchFeature from ...image_utils import ( ImageInput, VideoInput, + load_video, make_batched_videos, make_nested_list_of_images, ) @@ -425,32 +428,44 @@ class SmolVLMProcessor(ProcessorMixin): image_processor_input_names = self.image_processor.model_input_names return 
list(dict.fromkeys(image_processor_input_names + tokenizer_input_names)) - # Add model-specific video sampling method when applying the template - def apply_chat_template( + # TODO: raushan, has to be public method under `VideoProcessorBase` when API is added + def _load_video_for_model( self, - conversation, - max_frames=None, - target_fps=None, - skip_secs=1, - video_load_backend="pyav", - sample_indices_fn=None, - **kwargs, - ): - max_frames = self.default_max_frames if max_frames is None else max_frames - target_fps = self.default_fps if target_fps is None else target_fps + video: Union[str, "VideoInput"], + num_frames: Optional[int] = None, + fps: Optional[int] = None, + backend: str = "opencv", + skip_secs: int = 0.0, + ) -> np.array: + """ + Loads `video` to a numpy array. + + Args: + video (`str` or `VideoInput`): + The video to convert to the numpy array format. Can be a link to video or local path. + num_frames (`int`, *optional*): + Number of frames to sample uniformly. If not passed, the whole video is loaded. + fps (`int`, *optional*): + Number of frames to sample per second. Should be passed only when `num_frames=None`. + If not specified and `num_frames==None`, all frames are sampled. + backend (`str`, *optional*, defaults to `"opencv"`): + The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "opencv". + + Returns: + Tuple[`np.array`, Dict]: A tuple containing: + - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]). + - Metadata dictionary. + """ + max_frames = self.default_max_frames if num_frames is None else num_frames + target_fps = self.default_fps if fps is None else fps def sample_indices_fn_func(metadata, **fn_kwargs): return smolvlm_sample_indices_fn( metadata, max_frames=max_frames, target_fps=target_fps, skip_secs=skip_secs, **fn_kwargs ) - # word of caution- we are blindly overriding a callable kwarg here. 
- # typed kwargs would be a way to avoid that @molbap - if not sample_indices_fn: - sample_indices_fn = sample_indices_fn_func - return super().apply_chat_template( - conversation, video_load_backend=video_load_backend, sample_indices_fn=sample_indices_fn, **kwargs - ) + video, metadata = load_video(video, backend=backend, sample_indices_fn=sample_indices_fn_func) + return video, metadata __all__ = ["SmolVLMProcessor"] diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index b1c40e7ff2..d63eab7938 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -23,7 +23,7 @@ import sys import typing import warnings from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, TypedDict, Union +from typing import Any, Dict, List, Optional, TypedDict, Union import numpy as np import typing_extensions @@ -415,7 +415,6 @@ class ChatTemplateLoadKwargs(TypedDict, total=False): video_load_backend: Optional[str] = "pyav" video_fps: Optional[int] = None sampling_rate: Optional[int] = 16_000 - sample_indices_fn: Optional[Callable] = None load_audio_from_video: Optional[bool] = False @@ -435,7 +434,16 @@ class ProcessorChatTemplateKwargs(ChatTemplateLoadKwargs, TokenizerChatTemplateK class AllKwargsForChatTemplate( TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, ProcessorChatTemplateKwargs -): ... +): + processor_kwargs: ProcessingKwargs = { + **ProcessingKwargs.__annotations__, + } + mm_load_kwargs: ChatTemplateLoadKwargs = { + **TextKwargs.__annotations__, + } + template_kwargs: ProcessorChatTemplateKwargs = { + **ProcessorChatTemplateKwargs.__annotations__, + } class ProcessorMixin(PushToHubMixin): @@ -1315,19 +1323,20 @@ class ProcessorMixin(PushToHubMixin): "https://huggingface.co/docs/transformers/main/en/chat_templating for more information." 
) - # Fill two sets of kwargs that should be used by tokenizer's `apply_chat_template` - # and for multimodal data loading. Everything else will be used in `__call__` - tokenizer_template_kwargs = {} - for tokenizer_key in TokenizerChatTemplateKwargs.__annotations__.keys(): - default_value = getattr(TokenizerChatTemplateKwargs, tokenizer_key, None) - value = kwargs.pop(tokenizer_key, default_value) - tokenizer_template_kwargs[tokenizer_key] = value + # Fill sets of kwargs that should be used by different parts of template + processed_kwargs = { + "processor_kwargs": {}, + "mm_load_kwargs": {}, + "template_kwargs": {}, + } - mm_load_kwargs = {} - for mm_load_key in ChatTemplateLoadKwargs.__annotations__.keys(): - default_value = getattr(ChatTemplateLoadKwargs, mm_load_key, None) - value = kwargs.pop(mm_load_key, default_value) - mm_load_kwargs[mm_load_key] = value + for kwarg_type in processed_kwargs: + for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__.keys(): + kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type] + default_value = getattr(kwarg_type_defaults, key, None) + value = kwargs.pop(key, default_value) + if value is not None and not isinstance(value, dict): + processed_kwargs[kwarg_type][key] = value if isinstance(conversation, (list, tuple)) and ( isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content") @@ -1338,8 +1347,9 @@ class ProcessorMixin(PushToHubMixin): is_batched = False conversations = [conversation] - tokenize = kwargs.pop("tokenize", False) - return_dict = kwargs.pop("return_dict", False) + tokenize = processed_kwargs["template_kwargs"].pop("tokenize", False) + return_dict = processed_kwargs["template_kwargs"].pop("return_dict", False) + mm_load_kwargs = processed_kwargs["mm_load_kwargs"] if tokenize: batch_images, batch_videos = [], [] @@ -1382,7 +1392,7 @@ class ProcessorMixin(PushToHubMixin): for fname in video_fnames: if isinstance(fname, (list, tuple)) and 
isinstance(fname[0], str): - video = [np.array(load_image(image_fname)).T for image_fname in fname] + video = [np.array(load_image(image_fname)) for image_fname in fname] # create a 4D video because `load_video` always returns a 4D array video = np.stack(video) metadata = None @@ -1391,12 +1401,13 @@ class ProcessorMixin(PushToHubMixin): "If your model uses this metadata during processing, please load the whole video and let the model sample frames instead." ) else: - video, metadata = load_video( + # TODO: raushan, should be `self.video_processor.load_video_for_model` when API is added + video, metadata = self._load_video_for_model( fname, - num_frames=mm_load_kwargs["num_frames"], - fps=mm_load_kwargs["video_fps"], + num_frames=mm_load_kwargs.get("num_frames", None), + fps=mm_load_kwargs.get("video_fps", None), backend=mm_load_kwargs["video_load_backend"], - sample_indices_fn=mm_load_kwargs["sample_indices_fn"], + **kwargs, ) videos.append(video) video_metadata.append(metadata) @@ -1415,7 +1426,7 @@ class ProcessorMixin(PushToHubMixin): batch_images=batch_images, batch_videos=batch_videos, batch_video_metadata=batch_video_metadata, - **mm_load_kwargs, + **processed_kwargs["mm_load_kwargs"], ) prompt = self.tokenizer.apply_chat_template( @@ -1423,7 +1434,7 @@ class ProcessorMixin(PushToHubMixin): chat_template=chat_template, tokenize=False, return_dict=False, - **tokenizer_template_kwargs, + **processed_kwargs["template_kwargs"], ) if not is_batched: @@ -1438,14 +1449,14 @@ class ProcessorMixin(PushToHubMixin): # without actionable solution for users single_prompt = prompt[0] if is_batched else prompt if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token): - kwargs["add_special_tokens"] = False + processed_kwargs["processor_kwargs"]["add_special_tokens"] = False out = self( text=prompt, images=batch_images if batch_images else None, videos=batch_videos if batch_videos else None, audio=batch_audios if batch_audios else None, 
- **kwargs, + **processed_kwargs["processor_kwargs"], ) if return_dict: return out @@ -1453,6 +1464,37 @@ class ProcessorMixin(PushToHubMixin): return out["input_ids"] return prompt + # TODO: raushan, has to be public method under `VideoProcessorBase` when API is added + # Keep private so we can simply remove when needed + def _load_video_for_model( + self, + video: Union[str, "VideoInput"], + num_frames: Optional[int] = None, + fps: Optional[int] = None, + backend: str = "opencv", + ) -> np.array: + """ + Loads `video` to a numpy array. + + Args: + video (`str` or `VideoInput`): + The video to convert to the numpy array format. Can be a link to video or local path. + num_frames (`int`, *optional*): + Number of frames to sample uniformly. If not passed, the whole video is loaded. + fps (`int`, *optional*): + Number of frames to sample per second. Should be passed only when `num_frames=None`. + If not specified and `num_frames==None`, all frames are sampled. + backend (`str`, *optional*, defaults to `"opencv"`): + The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "opencv". + + Returns: + Tuple[`np.array`, Dict]: A tuple containing: + - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]). + - Metadata dictionary. + """ + video, metadata = load_video(video, num_frames, fps=fps, backend=backend) + return video, metadata + def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=True, **kwargs): """ Post-process the output of a vlm to decode the text. 
diff --git a/tests/models/aria/test_processor_aria.py b/tests/models/aria/test_processor_aria.py index 836563e109..ac222e1505 100644 --- a/tests/models/aria/test_processor_aria.py +++ b/tests/models/aria/test_processor_aria.py @@ -236,55 +236,6 @@ And who is that?<|im_end|> """ self.assertEqual(rendered, expected_rendered) - # Override as AriaImageProcessor doesn't accept `do_rescale` - def test_image_chat_template_accepts_processing_kwargs(self): - processor = self.get_processor() - if processor.chat_template is None: - self.skipTest("Processor has no chat template") - - messages = [ - [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - ] - - formatted_prompt_tokenized = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - padding="max_length", - max_length=50, - ) - self.assertEqual(len(formatted_prompt_tokenized[0]), 50) - - formatted_prompt_tokenized = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - truncation=True, - max_length=5, - ) - self.assertEqual(len(formatted_prompt_tokenized[0]), 5) - - # Now test the ability to return dict - messages[0][0]["content"].append( - {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"} - ) - out_dict = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - max_image_size=980, - return_tensors="np", - ) - self.assertListEqual(list(out_dict[self.images_input_name].shape), [1, 3, 980, 980]) - # Override as AriaProcessor needs image tokens in prompts def prepare_text_inputs(self, batch_size: Optional[int] = None): if batch_size is None: diff --git a/tests/models/aya_vision/test_processor_aya_vision.py b/tests/models/aya_vision/test_processor_aya_vision.py index 7e6e080b71..527f83c0bb 100644 --- a/tests/models/aya_vision/test_processor_aya_vision.py +++ 
b/tests/models/aya_vision/test_processor_aya_vision.py @@ -79,11 +79,6 @@ class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase): def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) - # todo: yoni, fix this test - @unittest.skip("Chat template has long system prompt") - def test_chat_template_accepts_processing_kwargs(self, **kwargs): - pass - # Override as AyaVisionProcessor needs image tokens in prompts def prepare_text_inputs(self, batch_size: Optional[int] = None): if batch_size is None: diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py index 56c854f4a4..637afebeb2 100644 --- a/tests/models/llava/test_processor_llava.py +++ b/tests/models/llava/test_processor_llava.py @@ -86,67 +86,3 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor = LlavaProcessor.from_pretrained(checkpoint) tokenizer = AutoTokenizer.from_pretrained(checkpoint) self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__) - - def test_chat_template(self): - processor = LlavaProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") - expected_prompt = "USER: \nWhat is shown in this image? 
ASSISTANT:" - - messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - - formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True) - self.assertEqual(expected_prompt, formatted_prompt) - - def test_chat_template_dict(self): - processor = LlavaProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") - messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - - formatted_prompt_tokenized = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) - expected_output = [[1, 3148, 1001, 29901, 29871, 32000, 29871, 13, 5618, 338, 4318, 297, 445, 1967, 29973, 319, 1799, 9047, 13566, 29901]] # fmt: skip - self.assertListEqual(expected_output, formatted_prompt_tokenized) - - out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True) - self.assertListEqual(list(out_dict.keys()), ["input_ids", "attention_mask"]) - - # add image URL for return dict - messages[0]["content"][0] = {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"} - out_dict_with_image = processor.apply_chat_template( - messages, add_generation_prompt=True, tokenize=True, return_dict=True - ) - self.assertListEqual(list(out_dict_with_image.keys()), ["input_ids", "attention_mask", "pixel_values"]) - - def test_chat_template_with_continue_final_message(self): - processor = LlavaProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") - expected_prompt = "USER: \nDescribe this image. 
ASSISTANT: There is a dog and" - messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "Describe this image."}, - ], - }, - { - "role": "assistant", - "content": [ - {"type": "text", "text": "There is a dog and"}, - ], - }, - ] - prompt = processor.apply_chat_template(messages, continue_final_message=True) - self.assertEqual(expected_prompt, prompt) diff --git a/tests/models/llava_next/test_processor_llava_next.py b/tests/models/llava_next/test_processor_llava_next.py index 4a14f5ab77..e8860a4335 100644 --- a/tests/models/llava_next/test_processor_llava_next.py +++ b/tests/models/llava_next/test_processor_llava_next.py @@ -78,23 +78,6 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_dict = self.prepare_processor_dict() self.assertTrue(processor_loaded.chat_template == processor_dict.get("chat_template", None)) - def test_chat_template(self): - processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-vicuna-7b-hf") - expected_prompt = "USER: \nWhat is shown in this image? 
ASSISTANT:" - - messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - - formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True) - self.assertEqual(expected_prompt, formatted_prompt) - def test_image_token_filling(self): processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-vicuna-7b-hf") processor.patch_size = 14 diff --git a/tests/models/llava_next_video/test_processor_llava_next_video.py b/tests/models/llava_next_video/test_processor_llava_next_video.py index affa95cc04..f74bbab01a 100644 --- a/tests/models/llava_next_video/test_processor_llava_next_video.py +++ b/tests/models/llava_next_video/test_processor_llava_next_video.py @@ -18,7 +18,7 @@ import tempfile import unittest from transformers import AutoProcessor, LlamaTokenizerFast, LlavaNextVideoProcessor -from transformers.testing_utils import require_av, require_torch, require_vision +from transformers.testing_utils import require_vision from transformers.utils import is_torch_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -28,7 +28,7 @@ if is_vision_available(): from transformers import LlavaNextImageProcessor, LlavaNextVideoImageProcessor if is_torch_available: - import torch + pass @require_vision @@ -90,79 +90,3 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase): @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) - - def test_chat_template(self): - processor = AutoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf") - expected_prompt = "USER: \nWhat is shown in this image? 
ASSISTANT:" - - messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - - formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True) - self.assertEqual(expected_prompt, formatted_prompt) - - @require_av - def test_chat_template_dict(self): - processor = AutoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf") - messages = [ - { - "role": "user", - "content": [ - {"type": "video"}, - {"type": "text", "text": "What is shown in this video?"}, - ], - }, - ] - - formatted_prompt_tokenized = processor.apply_chat_template( - messages, add_generation_prompt=True, tokenize=True, return_tensors=None - ) - expected_output = [[1, 3148, 1001, 29901, 29871, 32000, 13, 5618, 338, 4318, 297, 445, 4863, 29973, 319, 1799, 9047, 13566, 29901]] # fmt: skip - self.assertListEqual(expected_output, formatted_prompt_tokenized) - - out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True) - self.assertListEqual(list(out_dict.keys()), ["input_ids", "attention_mask"]) - - # add image URL for return dict - messages[0]["content"][0] = { - "type": "video", - "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4", - } - out_dict_with_video = processor.apply_chat_template( - messages, add_generation_prompt=True, tokenize=True, return_dict=True - ) - self.assertListEqual(list(out_dict_with_video.keys()), ["input_ids", "attention_mask", "pixel_values_videos"]) - - @require_torch - @require_av - def test_chat_template_dict_torch(self): - processor = AutoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf") - messages = [ - { - "role": "user", - "content": [ - { - "type": "video", - "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4", - }, - {"type": "text", "text": "What is shown in this video?"}, - ], - }, - ] - - 
out_dict_tensors = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - return_tensors="pt", - ) - self.assertListEqual(list(out_dict_tensors.keys()), ["input_ids", "attention_mask", "pixel_values_videos"]) - self.assertTrue(isinstance(out_dict_tensors["input_ids"], torch.Tensor)) diff --git a/tests/models/llava_onevision/test_processor_llava_onevision.py b/tests/models/llava_onevision/test_processor_llava_onevision.py index b545e5b396..9764cd4dcd 100644 --- a/tests/models/llava_onevision/test_processor_llava_onevision.py +++ b/tests/models/llava_onevision/test_processor_llava_onevision.py @@ -16,7 +16,7 @@ import shutil import tempfile import unittest -from transformers.testing_utils import require_av, require_vision +from transformers.testing_utils import require_vision from transformers.utils import is_torch_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -93,50 +93,3 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase): # so we check if the same template is loaded processor_dict = self.prepare_processor_dict() self.assertTrue(processor_loaded.chat_template == processor_dict.get("chat_template", None)) - - def test_chat_template(self): - processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf") - expected_prompt = "<|im_start|>user \nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" - - messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - - formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True) - self.assertEqual(expected_prompt, formatted_prompt) - - @require_av - def test_chat_template_dict(self): - processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf") - messages = [ - { - "role": "user", - "content": [ - {"type": "video"}, - {"type": 
"text", "text": "What is shown in this video?"}, - ], - }, - ] - - formatted_prompt_tokenized = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) - expected_output = [[151644, 872, 220, 151647, 198, 3838, 374, 6839, 304, 419, 2766, 30, 151645, 151644, 77091, 198]] # fmt: skip - self.assertListEqual(expected_output, formatted_prompt_tokenized) - - out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True) - self.assertListEqual(list(out_dict.keys()), ["input_ids", "attention_mask"]) - - # add image URL for return dict - messages[0]["content"][0] = { - "type": "video", - "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4", - } - out_dict_with_video = processor.apply_chat_template( - messages, add_generation_prompt=True, tokenize=True, return_dict=True - ) - self.assertListEqual(list(out_dict_with_video.keys()), ["input_ids", "attention_mask", "pixel_values_videos"]) diff --git a/tests/models/mistral3/test_processor_mistral3.py b/tests/models/mistral3/test_processor_mistral3.py index fa29d15f86..8677eea426 100644 --- a/tests/models/mistral3/test_processor_mistral3.py +++ b/tests/models/mistral3/test_processor_mistral3.py @@ -62,77 +62,6 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): def tearDown(self): shutil.rmtree(self.tmpdirname) - def test_chat_template_accepts_processing_kwargs(self): - # override to use slow image processor to return numpy arrays - processor = self.processor_class.from_pretrained(self.tmpdirname, use_fast=False) - if processor.chat_template is None: - self.skipTest("Processor has no chat template") - - messages = [ - [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - ] - - formatted_prompt_tokenized = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - padding="max_length", - 
truncation=True, - max_length=50, - ) - self.assertEqual(len(formatted_prompt_tokenized[0]), 50) - - formatted_prompt_tokenized = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - truncation=True, - max_length=5, - ) - self.assertEqual(len(formatted_prompt_tokenized[0]), 5) - - # Now test the ability to return dict - messages[0][0]["content"].append( - {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"} - ) - out_dict = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - do_rescale=True, - rescale_factor=-1, - return_tensors="np", - ) - self.assertLessEqual(out_dict[self.images_input_name][0][0].mean(), 0) - - def test_chat_template(self): - processor = self.processor_class.from_pretrained(self.tmpdirname, use_fast=False) - expected_prompt = "[SYSTEM_PROMPT][/SYSTEM_PROMPT][INST][IMG]What is shown in this image?[/INST]" - - messages = [ - { - "role": "system", - "content": "", - }, - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True) - self.assertEqual(expected_prompt, formatted_prompt) - def test_image_token_filling(self): processor = self.processor_class.from_pretrained(self.tmpdirname) # Important to check with non square image diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py index a678e7c010..3f66b98309 100644 --- a/tests/models/pixtral/test_processor_pixtral.py +++ b/tests/models/pixtral/test_processor_pixtral.py @@ -51,22 +51,6 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase): def tearDown(self): shutil.rmtree(self.tmpdirname) - def test_chat_template(self): - processor = self.processor_class.from_pretrained(self.tmpdirname) - expected_prompt = "[INST][IMG]What is shown in this 
image?[/INST]" - - messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True) - self.assertEqual(expected_prompt, formatted_prompt) - def test_image_token_filling(self): processor = self.processor_class.from_pretrained(self.tmpdirname) # Important to check with non square image diff --git a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py index a7060cfc98..352456da6e 100644 --- a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py +++ b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py @@ -17,12 +17,13 @@ import shutil import tempfile import unittest +import numpy as np import pytest from huggingface_hub import hf_hub_download from transformers import AutoProcessor, Qwen2Tokenizer from transformers.testing_utils import require_av, require_torch, require_vision -from transformers.utils import is_vision_available +from transformers.utils import is_torch_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -30,6 +31,9 @@ from ...test_processing_common import ProcessorTesterMixin if is_vision_available(): from transformers import Qwen2_5_VLProcessor, Qwen2VLImageProcessor +if is_torch_available(): + import torch + @require_vision @require_torch @@ -119,101 +123,97 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): self.assertListEqual(list(inputs.keys()), processor.model_input_names) - def test_image_chat_template_single(self): + @require_torch + def _test_apply_chat_template( + self, + modality: str, + batch_size: int, + return_tensors: str, + input_name: str, + processor_name: str, + input_data: list[str], + ): processor = self.get_processor() if processor.chat_template is None: self.skipTest("Processor has no chat template") - messages = [ + if processor_name not in 
self.processor_class.attributes: + self.skipTest(f"{processor_name} attribute not present in {self.processor_class}") + + batch_messages = [ [ { "role": "user", - "content": [ - {"type": "text", "text": "What is shown in this image?"}, - ], + "content": [{"type": "text", "text": "Describe this."}], }, ] - ] + ] * batch_size - formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) - self.assertEqual(len(formatted_prompt), 1) - - formatted_prompt_tokenized = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) - expected_output = processor.tokenizer(formatted_prompt, return_tensors=None).input_ids - self.assertListEqual(expected_output, formatted_prompt_tokenized) - - out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True) - self.assertListEqual(list(out_dict.keys()), ["input_ids", "attention_mask"]) - - # Now test the ability to return dict - messages[0][0]["content"].append( - {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"} - ) - out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True) - self.assertTrue(self.images_input_name in out_dict) - - # should always have input_ids and attention_mask - self.assertEqual(len(out_dict["input_ids"]), 1) - self.assertEqual(len(out_dict["attention_mask"]), 1) - self.assertEqual(len(out_dict[self.images_input_name]), 71280) - - def test_image_chat_template_batched(self): - processor = self.get_processor() - if processor.chat_template is None: - self.skipTest("Processor has no chat template") - - batched_messages = [ - [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ], - [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What do you see?"}, - ], - }, - ], - ] - - formatted_prompt = processor.apply_chat_template(batched_messages, 
add_generation_prompt=True, tokenize=False) - self.assertEqual(len(formatted_prompt), 2) + # Test that jinja can be applied + formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False) + self.assertEqual(len(formatted_prompt), batch_size) + # Test that tokenizing with template and directly with `self.tokenizer` gives same output formatted_prompt_tokenized = processor.apply_chat_template( - batched_messages, add_generation_prompt=True, tokenize=True, padding=True + batch_messages, add_generation_prompt=True, tokenize=True, return_tensors=return_tensors ) - expected_output = processor.tokenizer(formatted_prompt, return_tensors=None, padding=True).input_ids - self.assertListEqual(expected_output, formatted_prompt_tokenized) + add_special_tokens = True + if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token): + add_special_tokens = False + tok_output = processor.tokenizer( + formatted_prompt, return_tensors=return_tensors, add_special_tokens=add_special_tokens + ) + expected_output = tok_output.input_ids + self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist()) + + # Test that kwargs passed to processor's `__call__` are actually used + tokenized_prompt_100 = processor.apply_chat_template( + batch_messages, + add_generation_prompt=True, + tokenize=True, + padding="max_length", + truncation=True, + return_tensors=return_tensors, + max_length=100, + ) + self.assertEqual(len(tokenized_prompt_100[0]), 100) + + # Test that `return_dict=True` returns text related inputs in the dict + out_dict_text = processor.apply_chat_template( + batch_messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors=return_tensors, + ) + self.assertTrue(all(key in out_dict_text for key in ["input_ids", "attention_mask"])) + self.assertEqual(len(out_dict_text["input_ids"]), batch_size) + 
self.assertEqual(len(out_dict_text["attention_mask"]), batch_size) + + # Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict + for idx, url in enumerate(input_data[:batch_size]): + batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": modality, "url": url}] out_dict = processor.apply_chat_template( - batched_messages, add_generation_prompt=True, tokenize=True, return_dict=True, padding=True + batch_messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors=return_tensors, + num_frames=4, # by default no more than 4 frames, otherwise too slow ) - self.assertListEqual(list(out_dict.keys()), ["input_ids", "attention_mask"]) + input_name = getattr(self, input_name) + self.assertTrue(input_name in out_dict) + self.assertEqual(len(out_dict["input_ids"]), batch_size) + self.assertEqual(len(out_dict["attention_mask"]), batch_size) + self.assertEqual(len(out_dict[input_name]), batch_size * 19200) - # Now test the ability to return dict - batched_messages[0][0]["content"].append( - {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"} - ) - batched_messages[1][0]["content"].append( - {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"} - ) - out_dict = processor.apply_chat_template( - batched_messages, add_generation_prompt=True, tokenize=True, return_dict=True, padding=True - ) - self.assertTrue(self.images_input_name in out_dict) - - # should always have input_ids and attention_mask - self.assertEqual(len(out_dict["input_ids"]), 2) - self.assertEqual(len(out_dict["attention_mask"]), 2) - self.assertEqual(len(out_dict[self.images_input_name]), 90480) + return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list} + for k in out_dict: + self.assertIsInstance(out_dict[k], return_tensor_to_type[return_tensors]) @require_av - def test_chat_template_video(self): + def 
test_apply_chat_template_video_frame_sampling(self): processor = self.get_processor() if processor.chat_template is None: self.skipTest("Processor has no chat template") @@ -331,52 +331,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): self.assertEqual(inputs[self.images_input_name].shape[0], 800) @require_av - def test_chat_template_video_custom_sampling(self): - """ - Tests that models can pass their custom callables to sample video indices. - """ - processor = self.get_processor() - if processor.chat_template is None: - self.skipTest("Processor has no chat template") - - signature = inspect.signature(processor.__call__) - if "videos" not in {*signature.parameters.keys()} or ( - signature.parameters.get("videos") is not None - and signature.parameters["videos"].annotation == inspect._empty - ): - self.skipTest("Processor doesn't accept videos at input") - - video_file_path = hf_hub_download( - repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset" - ) - messages = [ - [ - { - "role": "user", - "content": [ - {"type": "video", "path": video_file_path}, - {"type": "text", "text": "What is shown in this video?"}, - ], - }, - ] - ] - - def dummy_sample_indices_fn(metadata, **fn_kwargs): - # sample only the first two frame always - return [0, 1] - - out_dict_with_video = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - sample_indices_fn=dummy_sample_indices_fn, - ) - self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 14400) - - @require_av - def test_chat_template_video_special_processing(self): + def test_apply_chat_template_video_special_processing(self): """ Tests that models can use their own preprocessing to preprocess conversations. 
""" @@ -433,6 +388,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): add_generation_prompt=True, tokenize=True, return_dict=True, + return_tensors="np", ) self.assertTrue(self.videos_input_name in out_dict_with_video) diff --git a/tests/models/qwen2_audio/test_processor_qwen2_audio.py b/tests/models/qwen2_audio/test_processor_qwen2_audio.py index ebd41520c3..8edf5cea7a 100644 --- a/tests/models/qwen2_audio/test_processor_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_processor_qwen2_audio.py @@ -54,7 +54,7 @@ class Qwen2AudioProcessorTest(ProcessorTesterMixin, unittest.TestCase): @staticmethod def prepare_processor_dict(): return { - "chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}", + "chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or content['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 
'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}", } # Override as Qwen2AudioProcessor needs audio tokens in prompts @@ -159,29 +159,3 @@ class Qwen2AudioProcessorTest(ProcessorTesterMixin, unittest.TestCase): formatted_prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) self.assertEqual(expected_prompt, formatted_prompt) - - def test_chat_template_with_continue_final_message(self): - processor = AutoProcessor.from_pretrained(self.checkpoint) - expected_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat's that sound?<|im_end|>\n<|im_start|>assistant\nIt is the sound of " # fmt: skip - messages = [ - { - "role": "system", - "content": [{"type": "text", "text": "You are a helpful assistant."}], - }, - { - "role": "user", - "content": [ - { - "type": "audio", - "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3", - }, - {"type": "text", "text": "What's that sound?"}, - ], - }, - { - "role": "assistant", - "content": [{"type": "text", "text": "It is the sound of "}], - }, - ] - prompt = processor.apply_chat_template(messages, continue_final_message=True) - self.assertEqual(expected_prompt, prompt) diff --git a/tests/models/qwen2_vl/test_processor_qwen2_vl.py b/tests/models/qwen2_vl/test_processor_qwen2_vl.py index b1a2e99c0a..720ba2f09f 100644 --- a/tests/models/qwen2_vl/test_processor_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_processor_qwen2_vl.py @@ -17,12 +17,13 @@ import shutil import tempfile import unittest +import numpy as np import pytest from huggingface_hub import hf_hub_download from transformers import AutoProcessor, Qwen2Tokenizer from transformers.testing_utils import require_av, require_torch, require_vision -from transformers.utils import is_vision_available 
+from transformers.utils import is_torch_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -30,6 +31,9 @@ from ...test_processing_common import ProcessorTesterMixin if is_vision_available(): from transformers import Qwen2VLImageProcessor, Qwen2VLProcessor +if is_torch_available(): + import torch + @require_vision @require_torch @@ -116,101 +120,97 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): self.assertListEqual(list(inputs.keys()), processor.model_input_names) - def test_image_chat_template_single(self): + @require_torch + def _test_apply_chat_template( + self, + modality: str, + batch_size: int, + return_tensors: str, + input_name: str, + processor_name: str, + input_data: list[str], + ): processor = self.get_processor() if processor.chat_template is None: self.skipTest("Processor has no chat template") - messages = [ + if processor_name not in self.processor_class.attributes: + self.skipTest(f"{processor_name} attribute not present in {self.processor_class}") + + batch_messages = [ [ { "role": "user", - "content": [ - {"type": "text", "text": "What is shown in this image?"}, - ], + "content": [{"type": "text", "text": "Describe this."}], }, ] - ] + ] * batch_size - formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) - self.assertEqual(len(formatted_prompt), 1) - - formatted_prompt_tokenized = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) - expected_output = processor.tokenizer(formatted_prompt, return_tensors=None).input_ids - self.assertListEqual(expected_output, formatted_prompt_tokenized) - - out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True) - self.assertListEqual(list(out_dict.keys()), ["input_ids", "attention_mask"]) - - # Now test the ability to return dict - messages[0][0]["content"].append( - {"type": "image", "url": 
"https://www.ilankelman.org/stopsigns/australia.jpg"} - ) - out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True) - self.assertTrue(self.images_input_name in out_dict) - - # should always have input_ids and attention_mask - self.assertEqual(len(out_dict["input_ids"]), 1) - self.assertEqual(len(out_dict["attention_mask"]), 1) - self.assertEqual(len(out_dict[self.images_input_name]), 71280) - - def test_image_chat_template_batched(self): - processor = self.get_processor() - if processor.chat_template is None: - self.skipTest("Processor has no chat template") - - batched_messages = [ - [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ], - [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What do you see?"}, - ], - }, - ], - ] - - formatted_prompt = processor.apply_chat_template(batched_messages, add_generation_prompt=True, tokenize=False) - self.assertEqual(len(formatted_prompt), 2) + # Test that jinja can be applied + formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False) + self.assertEqual(len(formatted_prompt), batch_size) + # Test that tokenizing with template and directly with `self.tokenizer` gives same output formatted_prompt_tokenized = processor.apply_chat_template( - batched_messages, add_generation_prompt=True, tokenize=True, padding=True + batch_messages, add_generation_prompt=True, tokenize=True, return_tensors=return_tensors ) - expected_output = processor.tokenizer(formatted_prompt, return_tensors=None, padding=True).input_ids - self.assertListEqual(expected_output, formatted_prompt_tokenized) + add_special_tokens = True + if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token): + add_special_tokens = False + tok_output = processor.tokenizer( + formatted_prompt, return_tensors=return_tensors, 
add_special_tokens=add_special_tokens + ) + expected_output = tok_output.input_ids + self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist()) + + # Test that kwargs passed to processor's `__call__` are actually used + tokenized_prompt_100 = processor.apply_chat_template( + batch_messages, + add_generation_prompt=True, + tokenize=True, + padding="max_length", + truncation=True, + return_tensors=return_tensors, + max_length=100, + ) + self.assertEqual(len(tokenized_prompt_100[0]), 100) + + # Test that `return_dict=True` returns text related inputs in the dict + out_dict_text = processor.apply_chat_template( + batch_messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors=return_tensors, + ) + self.assertTrue(all(key in out_dict_text for key in ["input_ids", "attention_mask"])) + self.assertEqual(len(out_dict_text["input_ids"]), batch_size) + self.assertEqual(len(out_dict_text["attention_mask"]), batch_size) + + # Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict + for idx, url in enumerate(input_data[:batch_size]): + batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": modality, "url": url}] out_dict = processor.apply_chat_template( - batched_messages, add_generation_prompt=True, tokenize=True, return_dict=True, padding=True + batch_messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors=return_tensors, + num_frames=4, # by default no more than 4 frames, otherwise too slow ) - self.assertListEqual(list(out_dict.keys()), ["input_ids", "attention_mask"]) + input_name = getattr(self, input_name) + self.assertTrue(input_name in out_dict) + self.assertEqual(len(out_dict["input_ids"]), batch_size) + self.assertEqual(len(out_dict["attention_mask"]), batch_size) + self.assertEqual(len(out_dict[input_name]), batch_size * 19200) - # Now test the ability to return dict - 
batched_messages[0][0]["content"].append( - {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"} - ) - batched_messages[1][0]["content"].append( - {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"} - ) - out_dict = processor.apply_chat_template( - batched_messages, add_generation_prompt=True, tokenize=True, return_dict=True, padding=True - ) - self.assertTrue(self.images_input_name in out_dict) - - # should always have input_ids and attention_mask - self.assertEqual(len(out_dict["input_ids"]), 2) - self.assertEqual(len(out_dict["attention_mask"]), 2) - self.assertEqual(len(out_dict[self.images_input_name]), 90480) + return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list} + for k in out_dict: + self.assertIsInstance(out_dict[k], return_tensor_to_type[return_tensors]) @require_av - def test_chat_template_video(self): + def test_apply_chat_template_video_frame_sampling(self): processor = self.get_processor() if processor.chat_template is None: self.skipTest("Processor has no chat template") @@ -312,52 +312,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280) @require_av - def test_chat_template_video_custom_sampling(self): - """ - Tests that models can pass their custom callables to sample video indices. 
- """ - processor = self.get_processor() - if processor.chat_template is None: - self.skipTest("Processor has no chat template") - - signature = inspect.signature(processor.__call__) - if "videos" not in {*signature.parameters.keys()} or ( - signature.parameters.get("videos") is not None - and signature.parameters["videos"].annotation == inspect._empty - ): - self.skipTest("Processor doesn't accept videos at input") - - video_file_path = hf_hub_download( - repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset" - ) - messages = [ - [ - { - "role": "user", - "content": [ - {"type": "video", "path": video_file_path}, - {"type": "text", "text": "What is shown in this video?"}, - ], - }, - ] - ] - - def dummy_sample_indices_fn(metadata, **fn_kwargs): - # sample only the first two frame always - return [0, 1] - - out_dict_with_video = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - sample_indices_fn=dummy_sample_indices_fn, - ) - self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 14400) - - @require_av - def test_chat_template_video_special_processing(self): + def test_apply_chat_template_video_special_processing(self): """ Tests that models can use their own preprocessing to preprocess conversations. 
""" @@ -414,6 +369,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): add_generation_prompt=True, tokenize=True, return_dict=True, + return_tensors="np", ) self.assertTrue(self.videos_input_name in out_dict_with_video) diff --git a/tests/models/shieldgemma2/test_processing_shieldgemma2.py b/tests/models/shieldgemma2/test_processing_shieldgemma2.py index 5574fdfe32..86d316fd88 100644 --- a/tests/models/shieldgemma2/test_processing_shieldgemma2.py +++ b/tests/models/shieldgemma2/test_processing_shieldgemma2.py @@ -162,29 +162,14 @@ class ShieldGemma2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): self.skipTest("Processor has no chat template") images = self.prepare_image_inputs(batch_size=2) - print(images) processed_inputs = processor(images=images) self.assertEqual(len(processed_inputs[self.text_input_name]), 6) self.assertEqual(len(processed_inputs[self.images_input_name]), 6) # TODO(ryanmullins): Adapt this test for ShieldGemma 2 + @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")]) @unittest.skip("ShieldGemma 2 chat template requires different message structure from parent.") - def test_image_chat_template_accepts_processing_kwargs(self): - pass - - # TODO(ryanmullins): Adapt this test for ShieldGemma 2 - @unittest.skip("ShieldGemma 2 chat template requires different message structure from parent.") - def test_image_chat_template_batched(self): - pass - - # TODO(ryanmullins): Adapt this test for ShieldGemma 2 - @unittest.skip("ShieldGemma 2 chat template requires different message structure from parent.") - def test_image_chat_template_dict_torch(self): - pass - - # TODO(ryanmullins): Adapt this test for ShieldGemma 2 - @unittest.skip("ShieldGemma 2 chat template requires different message structure from parent.") - def test_image_chat_template_single(self): + def test_apply_chat_template_image(self, batch_size: int, return_tensors: str): pass # TODO(ryanmullins): Adapt this test for ShieldGemma 2 diff --git 
a/tests/models/smolvlm/test_processor_smolvlm.py b/tests/models/smolvlm/test_processor_smolvlm.py index e06b74dca7..fad6e9489f 100644 --- a/tests/models/smolvlm/test_processor_smolvlm.py +++ b/tests/models/smolvlm/test_processor_smolvlm.py @@ -368,12 +368,12 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase): ) self.assertEqual(rendered, expected_rendered) - @unittest.skip(reason="Broken from common. Fixing TODO @zucchini-nlp @molbap") - def test_chat_template_video_special_processing(self): + @unittest.skip(reason="SmolVLM replaced `type=video` with `type=image` in chat templates") + def test_apply_chat_template_video_special_processing(self): pass @require_av - def test_chat_template_video(self): + def test_apply_chat_template_video_frame_sampling(self): # overriden because SmolVLM has special preprocessing for videos processor = self.get_processor() if processor.chat_template is None: @@ -401,11 +401,12 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase): tokenize=True, return_dict=True, num_frames=num_frames, + return_tensors="np", ) self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1) # SmolVLM doesn't sample `num_frames` exactly, by uses other sampling method - self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 10) + self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 3) # Load with `video_fps` arg video_fps = 1 @@ -415,6 +416,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase): tokenize=True, return_dict=True, video_fps=video_fps, + return_tensors="np", ) self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1) diff --git a/tests/models/vipllava/test_processor_vipllava.py b/tests/models/vipllava/test_processor_vipllava.py deleted file mode 100644 index 0ddf569a60..0000000000 --- 
a/tests/models/vipllava/test_processor_vipllava.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import unittest - -from transformers.testing_utils import require_vision -from transformers.utils import is_vision_available - - -if is_vision_available(): - from transformers import AutoProcessor - - -@require_vision -class LlavaProcessorTest(unittest.TestCase): - def test_chat_template(self): - processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf") - expected_prompt = "###Human: \nWhat is shown in this image?###Assistant:" - - messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - - formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True) - self.assertEqual(expected_prompt, formatted_prompt) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 6daea1fa3b..f016001c1c 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -22,6 +22,7 @@ from typing import Optional import numpy as np from huggingface_hub import hf_hub_download +from parameterized import parameterized from transformers.models.auto.processing_auto import processor_class_from_name from transformers.processing_utils import Unpack @@ -44,6 +45,22 @@ if is_torch_available(): import torch +MODALITY_INPUT_DATA = { + "images": 
[ + "http://images.cocodataset.org/val2017/000000039769.jpg", + "http://images.cocodataset.org/val2017/000000039769.jpg", + ], + "videos": [ + "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4", + ["https://www.ilankelman.org/stopsigns/australia.jpg", "https://www.ilankelman.org/stopsigns/australia.jpg"], + ], + "audio": [ + "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3", + "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav", + ], +} + + def prepare_image_inputs(): """This function prepares a list of PIL images""" image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] @@ -729,7 +746,7 @@ class ProcessorTesterMixin: ) def test_chat_template_save_loading(self): - processor = self.get_processor() + processor = self.processor_class.from_pretrained(self.tmpdirname) signature = inspect.signature(processor.__init__) if "chat_template" not in {*signature.parameters.keys()}: self.skipTest("Processor doesn't accept chat templates at input") @@ -756,210 +773,133 @@ class ProcessorTesterMixin: # the reloaded tokenizer should get the chat template as well self.assertEqual(reloaded_processor.chat_template, reloaded_processor.tokenizer.chat_template) - def test_image_chat_template_single(self): + @require_torch + def _test_apply_chat_template( + self, + modality: str, + batch_size: int, + return_tensors: str, + input_name: str, + processor_name: str, + input_data: list[str], + ): processor = self.get_processor() if processor.chat_template is None: self.skipTest("Processor has no chat template") - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") + if processor_name not in self.processor_class.attributes: + self.skipTest(f"{processor_name} attribute not present in {self.processor_class}") - messages = [ + # some models have only Fast image 
processor + if getattr(processor, processor_name).__class__.__name__.endswith("Fast"): + return_tensors = "pt" + + batch_messages = [ [ { "role": "user", - "content": [ - {"type": "text", "text": "What is shown in this image?"}, - ], + "content": [{"type": "text", "text": "Describe this."}], }, ] - ] + ] * batch_size - formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) - self.assertEqual(len(formatted_prompt), 1) + # Test that jinja can be applied + formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False) + self.assertEqual(len(formatted_prompt), batch_size) + # Test that tokenizing with template and directly with `self.tokenizer` gives same output formatted_prompt_tokenized = processor.apply_chat_template( - messages, add_generation_prompt=True, tokenize=True, return_tensors=None + batch_messages, add_generation_prompt=True, tokenize=True, return_tensors=return_tensors ) add_special_tokens = True if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token): add_special_tokens = False - expected_output = processor.tokenizer( - formatted_prompt, return_tensors=None, add_special_tokens=add_special_tokens - ).input_ids - self.assertListEqual(expected_output, formatted_prompt_tokenized) - - out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True) - self.assertTrue(all(key in out_dict for key in ["input_ids", "attention_mask"])) - - # Now test the ability to return dict - messages[0][0]["content"].append( - {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"} + tok_output = processor.tokenizer( + formatted_prompt, return_tensors=return_tensors, add_special_tokens=add_special_tokens ) - out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True) - self.assertTrue(self.images_input_name in 
out_dict) + expected_output = tok_output.input_ids + self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist()) - # should always have input_ids and attention_mask - self.assertEqual(len(out_dict["input_ids"]), 1) - self.assertEqual(len(out_dict["attention_mask"]), 1) - self.assertEqual(len(out_dict[self.images_input_name]), 1) - - def test_image_chat_template_batched(self): - processor = self.get_processor() - if processor.chat_template is None: - self.skipTest("Processor has no chat template") - - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - batched_messages = [ - [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ], - [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What do you see?"}, - ], - }, - ], - ] - - formatted_prompt = processor.apply_chat_template(batched_messages, add_generation_prompt=True, tokenize=False) - self.assertEqual(len(formatted_prompt), 2) - - formatted_prompt_tokenized = processor.apply_chat_template( - batched_messages, add_generation_prompt=True, tokenize=True, padding=True, return_tensors=None - ) - add_special_tokens = True - if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token): - add_special_tokens = False - expected_output = processor.tokenizer( - formatted_prompt, - return_tensors=None, - padding=True, - add_special_tokens=add_special_tokens, - ).input_ids - self.assertListEqual(expected_output, formatted_prompt_tokenized) - - out_dict = processor.apply_chat_template( - batched_messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - padding=True, - ) - self.assertTrue(all(key in out_dict for key in ["input_ids", "attention_mask"])) - - # Now test the ability to return dict - batched_messages[0][0]["content"].append( - {"type": "image", "url": 
"https://www.ilankelman.org/stopsigns/australia.jpg"} - ) - batched_messages[1][0]["content"].append( - {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"} - ) - out_dict = processor.apply_chat_template( - batched_messages, add_generation_prompt=True, tokenize=True, return_dict=True, padding=True - ) - self.assertTrue(self.images_input_name in out_dict) - - # should always have input_ids and attention_mask - self.assertEqual(len(out_dict["input_ids"]), 2) - self.assertEqual(len(out_dict["attention_mask"]), 2) - self.assertEqual(len(out_dict[self.images_input_name]), 2) - - def test_image_chat_template_accepts_processing_kwargs(self): - processor = self.get_processor() - if processor.chat_template is None: - self.skipTest("Processor has no chat template") - - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - messages = [ - [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - ] - - formatted_prompt_tokenized = processor.apply_chat_template( - messages, + # Test that kwargs passed to processor's `__call__` are actually used + tokenized_prompt_100 = processor.apply_chat_template( + batch_messages, add_generation_prompt=True, tokenize=True, padding="max_length", truncation=True, - max_length=50, + return_tensors=return_tensors, + max_length=100, ) - self.assertEqual(len(formatted_prompt_tokenized[0]), 50) + self.assertEqual(len(tokenized_prompt_100[0]), 100) - formatted_prompt_tokenized = processor.apply_chat_template( - messages, + # Test that `return_dict=True` returns text related inputs in the dict + out_dict_text = processor.apply_chat_template( + batch_messages, add_generation_prompt=True, tokenize=True, - truncation=True, - max_length=5, + return_dict=True, + return_tensors=return_tensors, ) - self.assertEqual(len(formatted_prompt_tokenized[0]), 5) + self.assertTrue(all(key 
in out_dict_text for key in ["input_ids", "attention_mask"])) + self.assertEqual(len(out_dict_text["input_ids"]), batch_size) + self.assertEqual(len(out_dict_text["attention_mask"]), batch_size) + + # Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict + for idx, url in enumerate(input_data[:batch_size]): + batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": modality, "url": url}] - # Now test the ability to return dict - messages[0][0]["content"].append( - {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"} - ) out_dict = processor.apply_chat_template( - messages, + batch_messages, add_generation_prompt=True, tokenize=True, return_dict=True, - do_rescale=True, - rescale_factor=-1, - return_tensors="np", + return_tensors=return_tensors, + num_frames=4, # by default no more than 4 frames, otherwise too slow ) - self.assertLessEqual(out_dict[self.images_input_name][0][0].mean(), 0) + input_name = getattr(self, input_name) + self.assertTrue(input_name in out_dict) + self.assertEqual(len(out_dict["input_ids"]), batch_size) + self.assertEqual(len(out_dict["attention_mask"]), batch_size) + self.assertEqual(len(out_dict[input_name]), batch_size) - @require_torch - def test_image_chat_template_dict_torch(self): - processor = self.get_processor() - if processor.chat_template is None: - self.skipTest("Processor has no chat template") + return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list} + for k in out_dict: + self.assertIsInstance(out_dict[k], return_tensor_to_type[return_tensors]) - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - messages = [ - { - "role": "user", - "content": [ - {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - - out_dict_tensors = 
processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - return_tensors="pt", - ) - self.assertTrue(self.images_input_name in out_dict_tensors) - for k in out_dict_tensors: - self.assertIsInstance(out_dict_tensors[k], torch.Tensor) + # Test continue from final message + assistant_message = { + "role": "assistant", + "content": [{"type": "text", "text": "It is the sound of"}], + } + for idx, url in enumerate(input_data[:batch_size]): + batch_messages[idx] = batch_messages[idx] + [assistant_message] + continue_prompt = processor.apply_chat_template(batch_messages, continue_final_message=True, tokenize=False) + for prompt in continue_prompt: + self.assertTrue(prompt.endswith("It is the sound of")) # no `eos` token at the end @require_av - def test_chat_template_video(self): + @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")]) + def test_apply_chat_template_audio(self, batch_size: int, return_tensors: str): + self._test_apply_chat_template( + "audio", batch_size, return_tensors, "audio_input_name", "feature_extractor", MODALITY_INPUT_DATA["audio"] + ) + + @require_librosa + @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")]) + def test_apply_chat_template_video(self, batch_size: int, return_tensors: str): + self._test_apply_chat_template( + "video", batch_size, return_tensors, "videos_input_name", "video_processor", MODALITY_INPUT_DATA["videos"] + ) + + @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")]) + def test_apply_chat_template_image(self, batch_size: int, return_tensors: str): + self._test_apply_chat_template( + "image", batch_size, return_tensors, "images_input_name", "image_processor", MODALITY_INPUT_DATA["images"] + ) + + def test_apply_chat_template_video_frame_sampling(self): processor = self.get_processor() + if processor.chat_template is None: self.skipTest("Processor has no chat template") @@ -975,37 +915,16 @@ class ProcessorTesterMixin: { "role":
"user", "content": [ - {"type": "video"}, + { + "type": "video", + "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4", + }, {"type": "text", "text": "What is shown in this video?"}, ], }, ] ] - formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) - self.assertEqual(len(formatted_prompt), 1) - - formatted_prompt_tokenized = processor.apply_chat_template( - messages, add_generation_prompt=True, tokenize=True, return_tensors=None - ) - add_special_tokens = True - if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token): - add_special_tokens = False - expected_output = processor.tokenizer( - formatted_prompt, - return_tensors=None, - add_special_tokens=add_special_tokens, - ).input_ids - self.assertListEqual(expected_output, formatted_prompt_tokenized) - - out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True) - self.assertTrue(all(key in out_dict for key in ["input_ids", "attention_mask"])) - - # Add video URL for return dict and load with `num_frames` arg - messages[0][0]["content"][0] = { - "type": "video", - "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4", - } num_frames = 3 out_dict_with_video = processor.apply_chat_template( messages, @@ -1013,6 +932,7 @@ class ProcessorTesterMixin: tokenize=True, return_dict=True, num_frames=num_frames, + return_tensors="np", ) self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1) @@ -1026,6 +946,7 @@ class ProcessorTesterMixin: tokenize=True, return_dict=True, video_fps=video_fps, + return_tensors="np", ) self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1) @@ -1073,53 +994,7 @@ class ProcessorTesterMixin: 
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 2) @require_av - def test_chat_template_video_custom_sampling(self): - """ - Tests that models can pass their custom callables to sample video indices. - """ - processor = self.get_processor() - if processor.chat_template is None: - self.skipTest("Processor has no chat template") - - signature = inspect.signature(processor.__call__) - if "videos" not in {*signature.parameters.keys()} or ( - signature.parameters.get("videos") is not None - and signature.parameters["videos"].annotation == inspect._empty - ): - self.skipTest("Processor doesn't accept videos at input") - - video_file_path = hf_hub_download( - repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset" - ) - messages = [ - [ - { - "role": "user", - "content": [ - {"type": "video", "path": video_file_path}, - {"type": "text", "text": "What is shown in this video?"}, - ], - }, - ] - ] - - def dummy_sample_indices_fn(metadata, **fn_kwargs): - # sample only the first two frame always - return [0, 1] - - out_dict_with_video = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - sample_indices_fn=dummy_sample_indices_fn, - ) - self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1) - self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 2) - - @require_av - def test_chat_template_video_special_processing(self): + def test_apply_chat_template_video_special_processing(self): """ Tests that models can use their own preprocessing to preprocess conversations. 
""" @@ -1176,6 +1051,7 @@ class ProcessorTesterMixin: add_generation_prompt=True, tokenize=True, return_dict=True, + return_tensors="np", ) self.assertTrue(self.videos_input_name in out_dict_with_video) @@ -1187,7 +1063,7 @@ class ProcessorTesterMixin: @require_librosa @require_av - def test_audio_chat_template_from_video(self): + def test_chat_template_audio_from_video(self): processor = self.get_processor() if processor.chat_template is None: self.skipTest("Processor has no chat template") @@ -1241,124 +1117,10 @@ class ProcessorTesterMixin: load_audio_from_video=True, ) self.assertTrue(self.audio_input_name in out_dict) - self.assertTrue(self.video_input_name in out_dict) + self.assertTrue(self.videos_input_name in out_dict) # should always have input_ids and attention_mask self.assertEqual(len(out_dict["input_ids"]), 1) # batch-size=1 self.assertEqual(len(out_dict["attention_mask"]), 1) # batch-size=1 self.assertEqual(len(out_dict[self.audio_input_name]), 2) # 2 audios in the conversation - self.assertEqual(len(out_dict[self.video_input_name]), 1) # 1 video in the conversation - - @require_librosa - def test_audio_chat_template_single(self): - processor = self.get_processor() - if processor.chat_template is None: - self.skipTest("Processor has no chat template") - - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - - messages = [ - { - "role": "system", - "content": [{"type": "text", "text": "You are a helpful assistant."}], - }, - { - "role": "user", - "content": [ - { - "type": "audio", - }, - {"type": "text", "text": "What's that sound?"}, - ], - }, - { - "role": "assistant", - "content": [{"type": "text", "text": "It is the sound of glass shattering."}], - }, - { - "role": "user", - "content": [ - { - "type": "audio", - }, - {"type": "text", "text": "How about this one?"}, - ], - }, - ] - - formatted_prompt = processor.apply_chat_template([messages], 
add_generation_prompt=True, tokenize=False) - self.assertEqual(len(formatted_prompt), 1) # batch size=1 - - formatted_prompt_tokenized = processor.apply_chat_template( - messages, add_generation_prompt=True, tokenize=True, return_tensors=None - ) - expected_output = processor.tokenizer(formatted_prompt, return_tensors=None).input_ids - self.assertListEqual(expected_output, formatted_prompt_tokenized) - - messages[1]["content"][0]["audio"] = ( - "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3" - ) - messages[3]["content"][0]["audio"] = ( - "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3" - ) - out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True) - self.assertTrue(self.audio_input_name in out_dict) - - # should always have input_ids and attention_mask - self.assertEqual(len(out_dict["input_ids"]), 1) # batch-size=1 - self.assertEqual(len(out_dict["attention_mask"]), 1) # batch-size=1 - self.assertEqual(len(out_dict[self.audio_input_name]), 2) # 2 audios in the conversation - - @require_torch - @require_librosa - def test_audio_chat_template_dict_torch(self): - processor = self.get_processor() - if processor.chat_template is None: - self.skipTest("Processor has no chat template") - - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - - messages = [ - { - "role": "system", - "content": [{"type": "text", "text": "You are a helpful assistant."}], - }, - { - "role": "user", - "content": [ - { - "type": "audio", - "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3", - }, - {"type": "text", "text": "What's that sound?"}, - ], - }, - { - "role": "assistant", - "content": [{"type": "text", "text": "It is the sound of glass shattering."}], - }, - { - "role": "user", - "content": [ - 
{ - "type": "audio", - "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav", - }, - {"type": "text", "text": "How about this one?"}, - ], - }, - ] - - out_dict_tensors = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - return_tensors="pt", - ) - - self.assertTrue(self.audio_input_name in out_dict_tensors) - for k in out_dict_tensors: - self.assertIsInstance(out_dict_tensors[k], torch.Tensor) + self.assertEqual(len(out_dict[self.videos_input_name]), 1) # 1 video in the conversation