From a563999a024306c6f6abec71012d0b462da3d6b2 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Fri, 11 Apr 2025 13:32:19 +0200 Subject: [PATCH] [processor] clean up multimodal tests (#37362) * clean up multimodal processor tests * fixup * fix tests * fix one last test * forgot --- .../models/aria/image_processing_aria.py | 39 +++- src/transformers/models/aria/modular_aria.py | 34 ++- .../models/colpali/processing_colpali.py | 2 + .../idefics/image_processing_idefics.py | 26 ++- .../models/paligemma/processing_paligemma.py | 2 + src/transformers/processing_utils.py | 1 - tests/models/aria/test_processor_aria.py | 219 ++++-------------- .../aya_vision/test_processor_aya_vision.py | 16 +- .../chameleon/test_processor_chameleon.py | 7 +- tests/models/emu3/test_processor_emu3.py | 6 +- tests/models/fuyu/test_processor_fuyu.py | 2 +- tests/models/gemma3/test_processing_gemma3.py | 17 +- .../got_ocr2/test_processor_got_ocr2.py | 1 + .../test_processor_grounding_dino.py | 2 +- .../models/idefics/test_processor_idefics.py | 136 ----------- .../idefics2/test_processor_idefics2.py | 19 +- .../idefics3/test_processor_idefics3.py | 158 +------------ tests/models/llama4/test_processor_llama4.py | 22 +- tests/models/llava/test_processor_llava.py | 11 +- .../llava_next/test_processor_llava_next.py | 11 +- .../test_processor_llava_next_video.py | 15 +- .../test_processor_llava_onevision.py | 11 +- .../mistral3/test_processor_mistral3.py | 66 +++--- tests/models/mllama/test_processor_mllama.py | 17 +- .../paligemma/test_processor_paligemma.py | 3 +- .../qwen2_5_vl/test_processor_qwen2_5_vl.py | 26 ++- .../qwen2_audio/test_processor_qwen2_audio.py | 16 +- .../qwen2_vl/test_processor_qwen2_vl.py | 22 +- .../models/smolvlm/test_processor_smolvlm.py | 137 +---------- tests/test_processing_common.py | 77 +++--- 30 files changed, 304 insertions(+), 817 deletions(-) diff --git a/src/transformers/models/aria/image_processing_aria.py 
b/src/transformers/models/aria/image_processing_aria.py index 0a04d8117d..364f8f70df 100644 --- a/src/transformers/models/aria/image_processing_aria.py +++ b/src/transformers/models/aria/image_processing_aria.py @@ -31,12 +31,16 @@ from ...image_utils import ( PILImageResampling, get_image_size, infer_channel_dimension_format, + is_scaled_image, make_flat_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, ) -from ...utils import TensorType +from ...utils import TensorType, logging + + +logger = logging.get_logger(__name__) def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]: @@ -104,6 +108,12 @@ class AriaImageProcessor(BaseImageProcessor): Whether to split the image. do_convert_rgb (`bool`, *optional*, defaults to `True`): Whether to convert the image to RGB. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image. 
resample (PILImageResampling, *optional*, defaults to `BICUBIC`): @@ -121,6 +131,8 @@ class AriaImageProcessor(BaseImageProcessor): split_resolutions: Optional[List[Tuple[int, int]]] = None, split_image: Optional[bool] = False, do_convert_rgb: Optional[bool] = True, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, do_normalize: Optional[bool] = True, resample: PILImageResampling = PILImageResampling.BICUBIC, **kwargs, @@ -141,6 +153,8 @@ class AriaImageProcessor(BaseImageProcessor): split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions] self.split_resolutions = split_resolutions self.do_convert_rgb = do_convert_rgb + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor self.do_normalize = do_normalize self.resample = resample @@ -153,6 +167,8 @@ class AriaImageProcessor(BaseImageProcessor): min_image_size: Optional[int] = None, split_image: Optional[bool] = None, do_convert_rgb: Optional[bool] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, resample: PILImageResampling = None, return_tensors: Optional[Union[str, TensorType]] = "pt", @@ -177,6 +193,10 @@ class AriaImageProcessor(BaseImageProcessor): Whether to split the image. do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)): Whether to convert the image to RGB. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)): Whether to normalize the image. 
resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)): @@ -217,6 +237,8 @@ class AriaImageProcessor(BaseImageProcessor): min_image_size = min_image_size if min_image_size is not None else self.min_image_size split_image = split_image if split_image is not None else self.split_image do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor do_normalize = do_normalize if do_normalize is not None else self.do_normalize resample = resample if resample is not None else self.resample @@ -236,6 +258,8 @@ class AriaImageProcessor(BaseImageProcessor): image_mean=image_mean, image_std=image_std, resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, ) if do_convert_rgb: @@ -244,6 +268,12 @@ class AriaImageProcessor(BaseImageProcessor): # All transformations expect numpy arrays. images = [to_numpy_array(image) for image in images] + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + if input_data_format is None: # We assume that all images have the same channel dimension format. 
input_data_format = infer_channel_dimension_format(images[0]) @@ -297,9 +327,14 @@ class AriaImageProcessor(BaseImageProcessor): pixel_mask[: new_size[0], : new_size[1]] = 1 pixel_masks.append(pixel_mask) + if do_rescale: + crop_image_padded = self.rescale( + image=crop_image_padded, scale=rescale_factor, input_data_format=input_data_format + ) + if do_normalize: crop_image_padded = self.normalize( - crop_image_padded / 255.0, + crop_image_padded, self.image_mean, self.image_std, data_format=input_data_format, diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 3f38c87b5d..fa0858cde3 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -28,6 +28,7 @@ from ...image_utils import ( PILImageResampling, get_image_size, infer_channel_dimension_format, + is_scaled_image, make_flat_list_of_images, to_numpy_array, valid_images, @@ -495,6 +496,12 @@ class AriaImageProcessor(BaseImageProcessor): Whether to split the image. do_convert_rgb (`bool`, *optional*, defaults to `True`): Whether to convert the image to RGB. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image. 
resample (PILImageResampling, *optional*, defaults to `BICUBIC`): @@ -512,6 +519,8 @@ class AriaImageProcessor(BaseImageProcessor): split_resolutions: Optional[List[Tuple[int, int]]] = None, split_image: Optional[bool] = False, do_convert_rgb: Optional[bool] = True, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, do_normalize: Optional[bool] = True, resample: PILImageResampling = PILImageResampling.BICUBIC, **kwargs, @@ -532,6 +541,8 @@ class AriaImageProcessor(BaseImageProcessor): split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions] self.split_resolutions = split_resolutions self.do_convert_rgb = do_convert_rgb + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor self.do_normalize = do_normalize self.resample = resample @@ -544,6 +555,8 @@ class AriaImageProcessor(BaseImageProcessor): min_image_size: Optional[int] = None, split_image: Optional[bool] = None, do_convert_rgb: Optional[bool] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, resample: PILImageResampling = None, return_tensors: Optional[Union[str, TensorType]] = "pt", @@ -568,6 +581,10 @@ class AriaImageProcessor(BaseImageProcessor): Whether to split the image. do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)): Whether to convert the image to RGB. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)): Whether to normalize the image. 
resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)): @@ -608,6 +625,8 @@ class AriaImageProcessor(BaseImageProcessor): min_image_size = min_image_size if min_image_size is not None else self.min_image_size split_image = split_image if split_image is not None else self.split_image do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor do_normalize = do_normalize if do_normalize is not None else self.do_normalize resample = resample if resample is not None else self.resample @@ -627,6 +646,8 @@ class AriaImageProcessor(BaseImageProcessor): image_mean=image_mean, image_std=image_std, resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, ) if do_convert_rgb: @@ -635,6 +656,12 @@ class AriaImageProcessor(BaseImageProcessor): # All transformations expect numpy arrays. images = [to_numpy_array(image) for image in images] + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + if input_data_format is None: # We assume that all images have the same channel dimension format. 
input_data_format = infer_channel_dimension_format(images[0]) @@ -688,9 +715,14 @@ class AriaImageProcessor(BaseImageProcessor): pixel_mask[: new_size[0], : new_size[1]] = 1 pixel_masks.append(pixel_mask) + if do_rescale: + crop_image_padded = self.rescale( + image=crop_image_padded, scale=rescale_factor, input_data_format=input_data_format + ) + if do_normalize: crop_image_padded = self.normalize( - crop_image_padded / 255.0, + crop_image_padded, self.image_mean, self.image_std, data_format=input_data_format, diff --git a/src/transformers/models/colpali/processing_colpali.py b/src/transformers/models/colpali/processing_colpali.py index eeb14901f7..2e6a68ca7a 100644 --- a/src/transformers/models/colpali/processing_colpali.py +++ b/src/transformers/models/colpali/processing_colpali.py @@ -118,8 +118,10 @@ class ColPaliProcessor(ProcessorMixin): tokens_to_add = {"additional_special_tokens": [image_token]} tokenizer.add_special_tokens(tokens_to_add) self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) + self.image_token = IMAGE_TOKEN else: self.image_token_id = tokenizer.image_token_id + self.image_token = tokenizer.image_token tokenizer.add_tokens(EXTRA_TOKENS) tokenizer.add_bos_token = False diff --git a/src/transformers/models/idefics/image_processing_idefics.py b/src/transformers/models/idefics/image_processing_idefics.py index 2b317da05d..768ef893d2 100644 --- a/src/transformers/models/idefics/image_processing_idefics.py +++ b/src/transformers/models/idefics/image_processing_idefics.py @@ -65,6 +65,12 @@ class IdeficsImageProcessor(BaseImageProcessor): Can be overridden by the `image_std` parameter in the `preprocess` method. image_num_channels (`int`, *optional*, defaults to 3): Number of image channels. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. 
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. """ model_input_names = ["pixel_values"] @@ -75,14 +81,18 @@ class IdeficsImageProcessor(BaseImageProcessor): image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, image_num_channels: Optional[int] = 3, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, **kwargs, ) -> None: super().__init__(**kwargs) self.image_size = image_size self.image_num_channels = image_num_channels - self.image_mean = image_mean - self.image_std = image_std + self.image_mean = image_mean if image_mean is not None else IDEFICS_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IDEFICS_STANDARD_STD + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor def preprocess( self, @@ -92,6 +102,8 @@ class IdeficsImageProcessor(BaseImageProcessor): image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, transform: Callable = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, **kwargs, ) -> TensorType: @@ -117,6 +129,12 @@ class IdeficsImageProcessor(BaseImageProcessor): A custom transform function that accepts a single image can be passed for training. For example, `torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is assumed - and then a preset of inference-specific transforms will be applied to the images + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. 
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. Returns: a PyTorch tensor of the processed images @@ -126,6 +144,8 @@ class IdeficsImageProcessor(BaseImageProcessor): image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor size = (image_size, image_size) if isinstance(images, list) and len(images) == 0: @@ -160,7 +180,7 @@ class IdeficsImageProcessor(BaseImageProcessor): # further transforms expect numpy arrays images = [to_numpy_array(x) for x in images] images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images] - images = [self.rescale(image=image, scale=1 / 255) for image in images] + images = [self.rescale(image=image, scale=rescale_factor) for image in images] images = [self.normalize(x, mean=image_mean, std=image_std) for x in images] images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images] images = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)["pixel_values"] diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py index f988d43583..f389487c2b 100644 --- a/src/transformers/models/paligemma/processing_paligemma.py +++ b/src/transformers/models/paligemma/processing_paligemma.py @@ -141,8 +141,10 @@ class PaliGemmaProcessor(ProcessorMixin): tokens_to_add = {"additional_special_tokens": [image_token]} tokenizer.add_special_tokens(tokens_to_add) self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) + self.image_token = IMAGE_TOKEN 
else: self.image_token_id = tokenizer.image_token_id + self.image_token = tokenizer.image_token tokenizer.add_tokens(EXTRA_TOKENS) tokenizer.add_bos_token = False diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index d63eab7938..17e41055c7 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -1086,7 +1086,6 @@ class ProcessorMixin(PushToHubMixin): args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs) - processor_dict.update({k: v for k, v in kwargs.items() if k in processor_dict.keys()}) return cls.from_args_and_dict(args, processor_dict, **kwargs) @classmethod diff --git a/tests/models/aria/test_processor_aria.py b/tests/models/aria/test_processor_aria.py index ac222e1505..08a6c5ba78 100644 --- a/tests/models/aria/test_processor_aria.py +++ b/tests/models/aria/test_processor_aria.py @@ -16,7 +16,6 @@ import shutil import tempfile import unittest from io import BytesIO -from typing import Optional import numpy as np import requests @@ -41,7 +40,7 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase): @classmethod def setUpClass(cls): cls.tmpdirname = tempfile.mkdtemp() - processor = AriaProcessor.from_pretrained("m-ric/Aria_hf_2", image_seq_len=2) + processor = AriaProcessor.from_pretrained("m-ric/Aria_hf_2", size_conversion={490: 2, 980: 2}) processor.save_pretrained(cls.tmpdirname) cls.image1 = Image.open( BytesIO( @@ -74,7 +73,14 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase): cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token) cls.global_img_tokens_id = processor.tokenizer(cls.global_img_token, add_special_tokens=False)["input_ids"] cls.padding_token_id = processor.tokenizer.pad_token_id - cls.image_seq_len = 256 + cls.image_seq_len = 2 + + @staticmethod + def prepare_processor_dict(): + 
return { + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}{% elif message['content'] is iterable %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<|img|>{% endif %}{% endfor %}{% endif %}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}", + "size_conversion": {490: 2, 980: 2}, + } # fmt: skip def get_tokenizer(self, **kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer @@ -89,24 +95,6 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase): def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) - def test_kwargs_overrides_default_image_processor_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor_components["image_processor"] = self.get_component( - "image_processor", do_rescale=True, rescale_factor=1 - ) - processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class(**processor_components) - self.skip_processor_without_typed_kwargs(processor) - - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) - def test_process_interleaved_images_prompts_image_splitting(self): processor = self.get_processor() processor.image_processor.split_image = True @@ -236,155 +224,50 @@ And who is that?<|im_end|> """ self.assertEqual(rendered, expected_rendered) - # Override as AriaProcessor needs 
image tokens in prompts - def prepare_text_inputs(self, batch_size: Optional[int] = None): - if batch_size is None: - return "lower newer <|img|>" + def test_image_chat_template_accepts_processing_kwargs(self): + processor = self.get_processor() + if processor.chat_template is None: + self.skipTest("Processor has no chat template") - if batch_size < 1: - raise ValueError("batch_size must be greater than 0") + messages = [ + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + ] + ] - if batch_size == 1: - return ["lower newer <|img|>"] - return ["lower newer <|img|>", "<|img|> upper older longer string"] + ["<|img|> lower newer"] * ( - batch_size - 2 - ) - - # Override tests as inputs_ids padded dimension is the second one but not the last one - @require_vision - @require_torch - def test_kwargs_overrides_default_tokenizer_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=30) - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=30) - self.assertEqual(len(inputs["input_ids"][0]), 30) - - @require_torch - @require_vision - def test_structured_kwargs_nested(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - 
self.skip_processor_without_typed_kwargs(processor) - - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - inputs = processor( - text=input_str, - images=image_input, - common_kwargs={"return_tensors": "pt"}, - images_kwargs={"max_image_size": 980}, - text_kwargs={"padding": "max_length", "max_length": 120, "truncation": "longest_first"}, - ) - self.skip_processor_without_typed_kwargs(processor) - - self.assertEqual(inputs["pixel_values"].shape[3], 980) - - self.assertEqual(len(inputs["input_ids"][0]), 120) - - @require_torch - @require_vision - def test_structured_kwargs_nested_from_dict(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"max_image_size": 980}, - "text_kwargs": {"padding": "max_length", "max_length": 120, "truncation": "longest_first"}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[3], 980) - self.assertEqual(len(inputs["input_ids"][0]), 120) - - @require_vision - @require_torch - def test_tokenizer_defaults_preserved_by_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=30) - - processor = 
self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(len(inputs["input_ids"][0]), 30) - - @require_torch - @require_vision - def test_unstructured_kwargs_batched(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = self.prepare_text_inputs(batch_size=2) - image_input = self.prepare_image_inputs(batch_size=2) - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - padding="longest", - max_length=76, - truncation=True, - max_image_size=980, - ) - - self.assertEqual(inputs["pixel_values"].shape[1], 3) - self.assertEqual(inputs["pixel_values"].shape[3], 980) - self.assertEqual(len(inputs["input_ids"][0]), 76) - - @require_torch - @require_vision - def test_unstructured_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - max_image_size=980, + formatted_prompt_tokenized = processor.apply_chat_template( + 
messages, + add_generation_prompt=True, + tokenize=True, padding="max_length", - max_length=120, - truncation="longest_first", + max_length=50, ) + self.assertEqual(len(formatted_prompt_tokenized[0]), 50) - self.assertEqual(inputs["pixel_values"].shape[3], 980) - self.assertEqual(len(inputs["input_ids"][0]), 120) + formatted_prompt_tokenized = processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + truncation=True, + max_length=5, + ) + self.assertEqual(len(formatted_prompt_tokenized[0]), 5) + + # Now test the ability to return dict + messages[0][0]["content"].append( + {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"} + ) + out_dict = processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + max_image_size=980, + return_tensors="np", + ) + self.assertListEqual(list(out_dict[self.images_input_name].shape), [1, 3, 980, 980]) diff --git a/tests/models/aya_vision/test_processor_aya_vision.py b/tests/models/aya_vision/test_processor_aya_vision.py index 527f83c0bb..9af13eab32 100644 --- a/tests/models/aya_vision/test_processor_aya_vision.py +++ b/tests/models/aya_vision/test_processor_aya_vision.py @@ -15,7 +15,6 @@ import shutil import tempfile import unittest -from typing import Optional from transformers import AutoProcessor, AutoTokenizer, AyaVisionProcessor from transformers.testing_utils import require_read_token, require_torch, require_vision @@ -61,6 +60,7 @@ class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase): **processor_kwargs, ) processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.image_token @staticmethod def prepare_processor_dict(): @@ -79,20 +79,6 @@ class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase): def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) - # Override as AyaVisionProcessor needs image tokens in prompts - def prepare_text_inputs(self, batch_size: 
Optional[int] = None): - if batch_size is None: - return "lower newer " - - if batch_size < 1: - raise ValueError("batch_size must be greater than 0") - - if batch_size == 1: - return ["lower newer "] - return ["lower newer ", " upper older longer string"] + [" lower newer"] * ( - batch_size - 2 - ) - @require_torch def test_process_interleaved_images_videos(self): processor = self.get_processor() diff --git a/tests/models/chameleon/test_processor_chameleon.py b/tests/models/chameleon/test_processor_chameleon.py index 2256d1cb0c..890b1f7f69 100644 --- a/tests/models/chameleon/test_processor_chameleon.py +++ b/tests/models/chameleon/test_processor_chameleon.py @@ -40,5 +40,10 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase): tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB) tokenizer.pad_token_id = 0 tokenizer.sep_token_id = 1 - processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer) + processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer, image_seq_length=2) processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.image_token + + @staticmethod + def prepare_processor_dict(): + return {"image_seq_length": 2} # fmt: skip diff --git a/tests/models/emu3/test_processor_emu3.py b/tests/models/emu3/test_processor_emu3.py index 90696b17b4..c595a91ee9 100644 --- a/tests/models/emu3/test_processor_emu3.py +++ b/tests/models/emu3/test_processor_emu3.py @@ -34,7 +34,7 @@ class Emu3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): @classmethod def setUpClass(cls): cls.tmpdirname = tempfile.mkdtemp() - image_processor = Emu3ImageProcessor() + image_processor = Emu3ImageProcessor(min_pixels=28 * 28, max_pixels=56 * 56) extra_special_tokens = extra_special_tokens = { "image_token": "", "boi_token": "<|image start|>", @@ -51,8 +51,10 @@ class Emu3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): image_processor=image_processor, tokenizer=tokenizer, 
chat_template="dummy_template" ) processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.image_token - def prepare_processor_dict(self): + @staticmethod + def prepare_processor_dict(): return { "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}", } # fmt: skip diff --git a/tests/models/fuyu/test_processor_fuyu.py b/tests/models/fuyu/test_processor_fuyu.py index 763e283670..1f2c754bd5 100644 --- a/tests/models/fuyu/test_processor_fuyu.py +++ b/tests/models/fuyu/test_processor_fuyu.py @@ -332,7 +332,7 @@ class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase): max_length=76, ) - self.assertEqual(len(inputs["input_ids"][0]), 6) + self.assertEqual(len(inputs["input_ids"][0]), 7) @require_torch diff --git a/tests/models/gemma3/test_processing_gemma3.py b/tests/models/gemma3/test_processing_gemma3.py index e583ca6db2..a2290c9928 100644 --- a/tests/models/gemma3/test_processing_gemma3.py +++ b/tests/models/gemma3/test_processing_gemma3.py @@ -56,6 +56,7 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_kwargs = cls.prepare_processor_dict() processor = Gemma3Processor(image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs) processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.boi_token @classmethod def tearDownClass(cls): @@ -68,20 +69,6 @@ class 
Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'model\n'}}\n{%- endif -%}\n", "image_seq_length": 3, } # fmt: skip - # Override as VLMs need image tokens in prompts - def prepare_text_inputs(self, batch_size: Optional[int] = None): - if batch_size is None: - return "lower newer " - - if batch_size < 1: - raise ValueError("batch_size must be greater than 0") - - if batch_size == 1: - return ["lower newer "] - return ["lower newer ", " upper older longer string"] + [ - " lower newer" - ] * (batch_size - 2) - # Override as Gemma3 needs images to be an explicitly nested batch def prepare_image_inputs(self, batch_size: Optional[int] = None): """This function prepares a list of PIL images for testing""" @@ -123,7 +110,7 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, 
unittest.TestCase): processor_kwargs = self.prepare_processor_dict() processor = self.processor_class(**processor_components, **processor_kwargs) - input_str = self.prepare_text_inputs() + input_str = self.prepare_text_inputs(modality="image") image_input = self.prepare_image_inputs() inputs = processor( text=input_str, diff --git a/tests/models/got_ocr2/test_processor_got_ocr2.py b/tests/models/got_ocr2/test_processor_got_ocr2.py index 3e7e7cb054..0719d211dd 100644 --- a/tests/models/got_ocr2/test_processor_got_ocr2.py +++ b/tests/models/got_ocr2/test_processor_got_ocr2.py @@ -40,6 +40,7 @@ class GotOcr2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_kwargs = {} processor = GotOcr2Processor(image_processor, tokenizer, **processor_kwargs) processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.img_pad_token def get_tokenizer(self, **kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 0b0174de45..35b77c39f2 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -79,7 +79,7 @@ class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase): cls.embed_dim = 5 cls.seq_length = 5 - def prepare_text_inputs(self, batch_size: Optional[int] = None): + def prepare_text_inputs(self, batch_size: Optional[int] = None, modality: Optional[str] = None): labels = ["a cat", "remote control"] labels_longer = ["a person", "a car", "a dog", "a cat"] diff --git a/tests/models/idefics/test_processor_idefics.py b/tests/models/idefics/test_processor_idefics.py index e161549166..483d1ad1e9 100644 --- a/tests/models/idefics/test_processor_idefics.py +++ b/tests/models/idefics/test_processor_idefics.py @@ -219,139 +219,3 @@ class IdeficsProcessorTest(ProcessorTesterMixin, unittest.TestCase): 
# For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask'] self.assertSetEqual(set(inputs.keys()), set(self.input_keys)) - - # Override the following tests as Idefics image processor does not accept do_rescale and rescale_factor - @require_torch - @require_vision - def test_image_processor_defaults_preserved_by_image_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", image_size=234) - tokenizer = self.get_component("tokenizer", max_length=117) - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs["pixel_values"][0][0][0]), 234) - - @require_torch - @require_vision - def test_kwargs_overrides_default_image_processor_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", image_size=234) - tokenizer = self.get_component("tokenizer", max_length=117) - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input, image_size=224) - self.assertEqual(len(inputs["pixel_values"][0][0][0]), 224) - - @require_torch - @require_vision - def test_unstructured_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = 
self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - image_size=214, - padding="max_length", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"].shape[3], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) - - @require_torch - @require_vision - def test_unstructured_kwargs_batched(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = self.prepare_text_inputs(batch_size=2) - image_input = self.prepare_image_inputs(batch_size=2) - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - image_size=214, - padding="longest", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"].shape[3], 214) - self.assertEqual(len(inputs["input_ids"][0]), 8) - - @require_torch - @require_vision - def test_structured_kwargs_nested(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - - # Define the 
kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"image_size": 214}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.skip_processor_without_typed_kwargs(processor) - self.assertEqual(inputs["pixel_values"].shape[3], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) - - @require_torch - @require_vision - def test_structured_kwargs_nested_from_dict(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"image_size": 214}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[3], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) diff --git a/tests/models/idefics2/test_processor_idefics2.py b/tests/models/idefics2/test_processor_idefics2.py index 99373005c9..f2f06af707 100644 --- a/tests/models/idefics2/test_processor_idefics2.py +++ b/tests/models/idefics2/test_processor_idefics2.py @@ -16,7 +16,6 @@ import shutil import tempfile import unittest from io import BytesIO -from typing import Optional import requests @@ -84,6 +83,10 @@ class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): def get_processor(self, **kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) + @staticmethod + def 
prepare_processor_dict(): + return {"image_seq_len": 2} + @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) @@ -329,17 +332,3 @@ class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): "Assistant:" ) self.assertEqual(rendered, expected_rendered) - - # Override as Idefics2Processor needs image tokens in prompts - def prepare_text_inputs(self, batch_size: Optional[int] = None): - if batch_size is None: - return "lower newer " - - if batch_size < 1: - raise ValueError("batch_size must be greater than 0") - - if batch_size == 1: - return ["lower newer "] - return ["lower newer ", " upper older longer string"] + [" lower newer"] * ( - batch_size - 2 - ) diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py index 5ff0eff946..ad8a24a5a1 100644 --- a/tests/models/idefics3/test_processor_idefics3.py +++ b/tests/models/idefics3/test_processor_idefics3.py @@ -16,7 +16,6 @@ import shutil import tempfile import unittest from io import BytesIO -from typing import Optional import numpy as np import requests @@ -81,6 +80,10 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): def get_processor(self, **kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) + @staticmethod + def prepare_processor_dict(): + return {"image_seq_len": 2} + def get_split_image_expected_tokens(self, processor, image_rows, image_cols): text_split_images = [] for n_h in range(image_rows): @@ -352,159 +355,6 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): ) self.assertEqual(rendered, expected_rendered) - # Override as Idefics3Processor needs image tokens in prompts - def prepare_text_inputs(self, batch_size: Optional[int] = None): - if batch_size is None: - return "lower newer " - - if batch_size < 1: - raise ValueError("batch_size must be greater than 0") - - if batch_size == 1: - return ["lower newer "] - return ["lower newer ", " upper 
older longer string"] + [" lower newer"] * ( - batch_size - 2 - ) - - # Override tests as inputs_ids padded dimension is the second one but not the last one - @require_vision - @require_torch - def test_kwargs_overrides_default_tokenizer_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=30) - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=30) - self.assertEqual(len(inputs["input_ids"][0]), 30) - - @require_torch - @require_vision - def test_structured_kwargs_nested(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - inputs = processor( - text=input_str, - images=image_input, - common_kwargs={"return_tensors": "pt"}, - images_kwargs={"max_image_size": {"longest_edge": 32}}, - text_kwargs={"padding": "max_length", "max_length": 120, "truncation": "longest_first"}, - ) - self.skip_processor_without_typed_kwargs(processor) - - self.assertEqual(inputs["pixel_values"].shape[3], 32) - - self.assertEqual(len(inputs["input_ids"][0]), 120) - - @require_torch - @require_vision - def 
test_structured_kwargs_nested_from_dict(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"max_image_size": {"longest_edge": 32}}, - "text_kwargs": {"padding": "max_length", "max_length": 120, "truncation": "longest_first"}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[3], 32) - self.assertEqual(len(inputs["input_ids"][0]), 120) - - @require_vision - @require_torch - def test_tokenizer_defaults_preserved_by_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=30) - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(len(inputs["input_ids"][0]), 30) - - @require_torch - @require_vision - def test_unstructured_kwargs_batched(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = 
self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = self.prepare_text_inputs(batch_size=2) - image_input = self.prepare_image_inputs(batch_size=2) - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - padding="longest", - max_length=76, - truncation=True, - max_image_size={"longest_edge": 30}, - ) - - self.assertEqual(inputs["pixel_values"].shape[2], 3) - self.assertEqual(inputs["pixel_values"].shape[3], 30) - self.assertEqual(len(inputs["input_ids"][0]), 76) - - @require_torch - @require_vision - def test_unstructured_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = self.prepare_text_inputs() - image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - max_image_size={"longest_edge": 32}, - padding="max_length", - max_length=120, - truncation="longest_first", - ) - - self.assertEqual(inputs["pixel_values"].shape[3], 32) - self.assertEqual(len(inputs["input_ids"][0]), 120) - @require_torch @require_vision def test_text_only_inference(self): diff --git a/tests/models/llama4/test_processor_llama4.py b/tests/models/llama4/test_processor_llama4.py index 8d4f87b104..aef3539a37 100644 --- a/tests/models/llama4/test_processor_llama4.py +++ b/tests/models/llama4/test_processor_llama4.py @@ -15,7 +15,6 @@ import shutil import tempfile import unittest -from typing import Optional from transformers import AutoProcessor, Llama4Processor, PreTrainedTokenizerFast from 
transformers.testing_utils import require_vision @@ -38,9 +37,10 @@ class Llama4ProcessorTest(ProcessorTesterMixin, unittest.TestCase): image_processor = Llama4ImageProcessorFast(max_patches=1, size={"height": 20, "width": 20}) tokenizer = PreTrainedTokenizerFast.from_pretrained("unsloth/Llama-3.2-11B-Vision-Instruct-unsloth-bnb-4bit") - processor_kwargs = {} + processor_kwargs = cls.prepare_processor_dict() processor = Llama4Processor(image_processor, tokenizer, **processor_kwargs) processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.image_token def get_tokenizer(self, **kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer @@ -51,21 +51,3 @@ class Llama4ProcessorTest(ProcessorTesterMixin, unittest.TestCase): @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdirname) - - # Override as Llama4Processor needs image tokens in prompts - def prepare_text_inputs(self, batch_size: Optional[int] = None): - if batch_size is None: - return "lower newer <|image|>" - - if batch_size < 1: - raise ValueError("batch_size must be greater than 0") - - if batch_size == 1: - return ["lower newer <|image|>"] - return ["lower newer <|image|>", "<|image|> upper older longer string"] + ["<|image|> lower newer"] * ( - batch_size - 2 - ) - - @unittest.skip("This test uses return_tensors='np' which is not supported") - def test_image_chat_template_accepts_processing_kwargs(self): - pass diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py index 637afebeb2..3a469d76f2 100644 --- a/tests/models/llava/test_processor_llava.py +++ b/tests/models/llava/test_processor_llava.py @@ -43,6 +43,7 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_kwargs = cls.prepare_processor_dict() processor = LlavaProcessor(image_processor, tokenizer, **processor_kwargs) processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.image_token def get_tokenizer(self, 
**kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer @@ -58,18 +59,10 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): def prepare_processor_dict(): return { "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}", - "patch_size": 3, + "patch_size": 128, "vision_feature_select_strategy": "default" } # fmt: skip - @unittest.skip( - "Skip because the model has no processor kwargs except for chat template and" - "chat template is saved as a separate file. Stop skipping this test when the processor" - "has new kwargs saved in config file." 
- ) - def test_processor_to_json_string(self): - pass - def test_chat_template_is_saved(self): processor_loaded = self.processor_class.from_pretrained(self.tmpdirname) processor_dict_loaded = json.loads(processor_loaded.to_json_string()) diff --git a/tests/models/llava_next/test_processor_llava_next.py b/tests/models/llava_next/test_processor_llava_next.py index e8860a4335..47fbb241ac 100644 --- a/tests/models/llava_next/test_processor_llava_next.py +++ b/tests/models/llava_next/test_processor_llava_next.py @@ -43,6 +43,7 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_kwargs = cls.prepare_processor_dict() processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs) processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.image_token def get_tokenizer(self, **kwargs): return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer @@ -54,18 +55,10 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase): def prepare_processor_dict(): return { "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}", - "patch_size": 3, + "patch_size": 128, "vision_feature_select_strategy": "default" } # fmt: skip - @unittest.skip( - "Skip because the model has no processor kwargs except for chat template and" - "chat template is saved as a separate file. 
Stop skipping this test when the processor" - "has new kwargs saved in config file." - ) - def test_processor_to_json_string(self): - pass - # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved def test_chat_template_is_saved(self): processor_loaded = self.processor_class.from_pretrained(self.tmpdirname) diff --git a/tests/models/llava_next_video/test_processor_llava_next_video.py b/tests/models/llava_next_video/test_processor_llava_next_video.py index f74bbab01a..207d1a6372 100644 --- a/tests/models/llava_next_video/test_processor_llava_next_video.py +++ b/tests/models/llava_next_video/test_processor_llava_next_video.py @@ -47,6 +47,8 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase): video_processor=video_processor, image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs ) processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.image_token + cls.video_token = processor.video_token def get_tokenizer(self, **kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer @@ -61,20 +63,11 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase): def prepare_processor_dict(cls): return { "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + ' '}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '' }}{% endfor %}{# Render all video then #}{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '