[processor] clean up multimodal tests (#37362)

* clean up multimodal processor tests

* fixup

* fix tests

* fix one last test

* forgot
This commit is contained in:
Raushan Turganbay
2025-04-11 13:32:19 +02:00
committed by GitHub
parent 3c39c07939
commit a563999a02
30 changed files with 304 additions and 817 deletions

View File

@@ -31,12 +31,16 @@ from ...image_utils import (
PILImageResampling, PILImageResampling,
get_image_size, get_image_size,
infer_channel_dimension_format, infer_channel_dimension_format,
is_scaled_image,
make_flat_list_of_images, make_flat_list_of_images,
to_numpy_array, to_numpy_array,
valid_images, valid_images,
validate_preprocess_arguments, validate_preprocess_arguments,
) )
from ...utils import TensorType from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]: def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
@@ -104,6 +108,12 @@ class AriaImageProcessor(BaseImageProcessor):
Whether to split the image. Whether to split the image.
do_convert_rgb (`bool`, *optional*, defaults to `True`): do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB. Whether to convert the image to RGB.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
method.
do_normalize (`bool`, *optional*, defaults to `True`): do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Whether to normalize the image.
resample (PILImageResampling, *optional*, defaults to `BICUBIC`): resample (PILImageResampling, *optional*, defaults to `BICUBIC`):
@@ -121,6 +131,8 @@ class AriaImageProcessor(BaseImageProcessor):
split_resolutions: Optional[List[Tuple[int, int]]] = None, split_resolutions: Optional[List[Tuple[int, int]]] = None,
split_image: Optional[bool] = False, split_image: Optional[bool] = False,
do_convert_rgb: Optional[bool] = True, do_convert_rgb: Optional[bool] = True,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: Optional[bool] = True, do_normalize: Optional[bool] = True,
resample: PILImageResampling = PILImageResampling.BICUBIC, resample: PILImageResampling = PILImageResampling.BICUBIC,
**kwargs, **kwargs,
@@ -141,6 +153,8 @@ class AriaImageProcessor(BaseImageProcessor):
split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions] split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions]
self.split_resolutions = split_resolutions self.split_resolutions = split_resolutions
self.do_convert_rgb = do_convert_rgb self.do_convert_rgb = do_convert_rgb
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize self.do_normalize = do_normalize
self.resample = resample self.resample = resample
@@ -153,6 +167,8 @@ class AriaImageProcessor(BaseImageProcessor):
min_image_size: Optional[int] = None, min_image_size: Optional[int] = None,
split_image: Optional[bool] = None, split_image: Optional[bool] = None,
do_convert_rgb: Optional[bool] = None, do_convert_rgb: Optional[bool] = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None, do_normalize: Optional[bool] = None,
resample: PILImageResampling = None, resample: PILImageResampling = None,
return_tensors: Optional[Union[str, TensorType]] = "pt", return_tensors: Optional[Union[str, TensorType]] = "pt",
@@ -177,6 +193,10 @@ class AriaImageProcessor(BaseImageProcessor):
Whether to split the image. Whether to split the image.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)): do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)):
Whether to convert the image to RGB. Whether to convert the image to RGB.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)): do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)):
Whether to normalize the image. Whether to normalize the image.
resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)): resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)):
@@ -217,6 +237,8 @@ class AriaImageProcessor(BaseImageProcessor):
min_image_size = min_image_size if min_image_size is not None else self.min_image_size min_image_size = min_image_size if min_image_size is not None else self.min_image_size
split_image = split_image if split_image is not None else self.split_image split_image = split_image if split_image is not None else self.split_image
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize do_normalize = do_normalize if do_normalize is not None else self.do_normalize
resample = resample if resample is not None else self.resample resample = resample if resample is not None else self.resample
@@ -236,6 +258,8 @@ class AriaImageProcessor(BaseImageProcessor):
image_mean=image_mean, image_mean=image_mean,
image_std=image_std, image_std=image_std,
resample=resample, resample=resample,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
) )
if do_convert_rgb: if do_convert_rgb:
@@ -244,6 +268,12 @@ class AriaImageProcessor(BaseImageProcessor):
# All transformations expect numpy arrays. # All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images] images = [to_numpy_array(image) for image in images]
if do_rescale and is_scaled_image(images[0]):
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
if input_data_format is None: if input_data_format is None:
# We assume that all images have the same channel dimension format. # We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(images[0]) input_data_format = infer_channel_dimension_format(images[0])
@@ -297,9 +327,14 @@ class AriaImageProcessor(BaseImageProcessor):
pixel_mask[: new_size[0], : new_size[1]] = 1 pixel_mask[: new_size[0], : new_size[1]] = 1
pixel_masks.append(pixel_mask) pixel_masks.append(pixel_mask)
if do_rescale:
crop_image_padded = self.rescale(
image=crop_image_padded, scale=rescale_factor, input_data_format=input_data_format
)
if do_normalize: if do_normalize:
crop_image_padded = self.normalize( crop_image_padded = self.normalize(
crop_image_padded / 255.0, crop_image_padded,
self.image_mean, self.image_mean,
self.image_std, self.image_std,
data_format=input_data_format, data_format=input_data_format,

View File

@@ -28,6 +28,7 @@ from ...image_utils import (
PILImageResampling, PILImageResampling,
get_image_size, get_image_size,
infer_channel_dimension_format, infer_channel_dimension_format,
is_scaled_image,
make_flat_list_of_images, make_flat_list_of_images,
to_numpy_array, to_numpy_array,
valid_images, valid_images,
@@ -495,6 +496,12 @@ class AriaImageProcessor(BaseImageProcessor):
Whether to split the image. Whether to split the image.
do_convert_rgb (`bool`, *optional*, defaults to `True`): do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB. Whether to convert the image to RGB.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
method.
do_normalize (`bool`, *optional*, defaults to `True`): do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Whether to normalize the image.
resample (PILImageResampling, *optional*, defaults to `BICUBIC`): resample (PILImageResampling, *optional*, defaults to `BICUBIC`):
@@ -512,6 +519,8 @@ class AriaImageProcessor(BaseImageProcessor):
split_resolutions: Optional[List[Tuple[int, int]]] = None, split_resolutions: Optional[List[Tuple[int, int]]] = None,
split_image: Optional[bool] = False, split_image: Optional[bool] = False,
do_convert_rgb: Optional[bool] = True, do_convert_rgb: Optional[bool] = True,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: Optional[bool] = True, do_normalize: Optional[bool] = True,
resample: PILImageResampling = PILImageResampling.BICUBIC, resample: PILImageResampling = PILImageResampling.BICUBIC,
**kwargs, **kwargs,
@@ -532,6 +541,8 @@ class AriaImageProcessor(BaseImageProcessor):
split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions] split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions]
self.split_resolutions = split_resolutions self.split_resolutions = split_resolutions
self.do_convert_rgb = do_convert_rgb self.do_convert_rgb = do_convert_rgb
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize self.do_normalize = do_normalize
self.resample = resample self.resample = resample
@@ -544,6 +555,8 @@ class AriaImageProcessor(BaseImageProcessor):
min_image_size: Optional[int] = None, min_image_size: Optional[int] = None,
split_image: Optional[bool] = None, split_image: Optional[bool] = None,
do_convert_rgb: Optional[bool] = None, do_convert_rgb: Optional[bool] = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None, do_normalize: Optional[bool] = None,
resample: PILImageResampling = None, resample: PILImageResampling = None,
return_tensors: Optional[Union[str, TensorType]] = "pt", return_tensors: Optional[Union[str, TensorType]] = "pt",
@@ -568,6 +581,10 @@ class AriaImageProcessor(BaseImageProcessor):
Whether to split the image. Whether to split the image.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)): do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)):
Whether to convert the image to RGB. Whether to convert the image to RGB.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)): do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)):
Whether to normalize the image. Whether to normalize the image.
resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)): resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)):
@@ -608,6 +625,8 @@ class AriaImageProcessor(BaseImageProcessor):
min_image_size = min_image_size if min_image_size is not None else self.min_image_size min_image_size = min_image_size if min_image_size is not None else self.min_image_size
split_image = split_image if split_image is not None else self.split_image split_image = split_image if split_image is not None else self.split_image
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize do_normalize = do_normalize if do_normalize is not None else self.do_normalize
resample = resample if resample is not None else self.resample resample = resample if resample is not None else self.resample
@@ -627,6 +646,8 @@ class AriaImageProcessor(BaseImageProcessor):
image_mean=image_mean, image_mean=image_mean,
image_std=image_std, image_std=image_std,
resample=resample, resample=resample,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
) )
if do_convert_rgb: if do_convert_rgb:
@@ -635,6 +656,12 @@ class AriaImageProcessor(BaseImageProcessor):
# All transformations expect numpy arrays. # All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images] images = [to_numpy_array(image) for image in images]
if do_rescale and is_scaled_image(images[0]):
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
if input_data_format is None: if input_data_format is None:
# We assume that all images have the same channel dimension format. # We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(images[0]) input_data_format = infer_channel_dimension_format(images[0])
@@ -688,9 +715,14 @@ class AriaImageProcessor(BaseImageProcessor):
pixel_mask[: new_size[0], : new_size[1]] = 1 pixel_mask[: new_size[0], : new_size[1]] = 1
pixel_masks.append(pixel_mask) pixel_masks.append(pixel_mask)
if do_rescale:
crop_image_padded = self.rescale(
image=crop_image_padded, scale=rescale_factor, input_data_format=input_data_format
)
if do_normalize: if do_normalize:
crop_image_padded = self.normalize( crop_image_padded = self.normalize(
crop_image_padded / 255.0, crop_image_padded,
self.image_mean, self.image_mean,
self.image_std, self.image_std,
data_format=input_data_format, data_format=input_data_format,

View File

@@ -118,8 +118,10 @@ class ColPaliProcessor(ProcessorMixin):
tokens_to_add = {"additional_special_tokens": [image_token]} tokens_to_add = {"additional_special_tokens": [image_token]}
tokenizer.add_special_tokens(tokens_to_add) tokenizer.add_special_tokens(tokens_to_add)
self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
self.image_token = IMAGE_TOKEN
else: else:
self.image_token_id = tokenizer.image_token_id self.image_token_id = tokenizer.image_token_id
self.image_token = tokenizer.image_token
tokenizer.add_tokens(EXTRA_TOKENS) tokenizer.add_tokens(EXTRA_TOKENS)
tokenizer.add_bos_token = False tokenizer.add_bos_token = False

View File

@@ -65,6 +65,12 @@ class IdeficsImageProcessor(BaseImageProcessor):
Can be overridden by the `image_std` parameter in the `preprocess` method. Can be overridden by the `image_std` parameter in the `preprocess` method.
image_num_channels (`int`, *optional*, defaults to 3): image_num_channels (`int`, *optional*, defaults to 3):
Number of image channels. Number of image channels.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
method.
""" """
model_input_names = ["pixel_values"] model_input_names = ["pixel_values"]
@@ -75,14 +81,18 @@ class IdeficsImageProcessor(BaseImageProcessor):
image_mean: Optional[Union[float, List[float]]] = None, image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None,
image_num_channels: Optional[int] = 3, image_num_channels: Optional[int] = 3,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
**kwargs, **kwargs,
) -> None: ) -> None:
super().__init__(**kwargs) super().__init__(**kwargs)
self.image_size = image_size self.image_size = image_size
self.image_num_channels = image_num_channels self.image_num_channels = image_num_channels
self.image_mean = image_mean self.image_mean = image_mean if image_mean is not None else IDEFICS_STANDARD_MEAN
self.image_std = image_std self.image_std = image_std if image_std is not None else IDEFICS_STANDARD_STD
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
def preprocess( def preprocess(
self, self,
@@ -92,6 +102,8 @@ class IdeficsImageProcessor(BaseImageProcessor):
image_mean: Optional[Union[float, List[float]]] = None, image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None,
transform: Callable = None, transform: Callable = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
**kwargs, **kwargs,
) -> TensorType: ) -> TensorType:
@@ -117,6 +129,12 @@ class IdeficsImageProcessor(BaseImageProcessor):
A custom transform function that accepts a single image can be passed for training. For example, A custom transform function that accepts a single image can be passed for training. For example,
`torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is `torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
assumed - and then a preset of inference-specific transforms will be applied to the images assumed - and then a preset of inference-specific transforms will be applied to the images
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image by the specified scale `rescale_factor`. Overrides the `do_rescale` value
set on the image processor.
rescale_factor (`int` or `float`, *optional*, defaults to `self.rescale_factor`):
Scale factor to use if rescaling the image. Overrides the `rescale_factor` value set on the image
processor.
Returns: Returns:
a PyTorch tensor of the processed images a PyTorch tensor of the processed images
@@ -126,6 +144,8 @@ class IdeficsImageProcessor(BaseImageProcessor):
image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels
image_mean = image_mean if image_mean is not None else self.image_mean image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std image_std = image_std if image_std is not None else self.image_std
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
size = (image_size, image_size) size = (image_size, image_size)
if isinstance(images, list) and len(images) == 0: if isinstance(images, list) and len(images) == 0:
@@ -160,7 +180,7 @@ class IdeficsImageProcessor(BaseImageProcessor):
# further transforms expect numpy arrays # further transforms expect numpy arrays
images = [to_numpy_array(x) for x in images] images = [to_numpy_array(x) for x in images]
images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images] images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images]
images = [self.rescale(image=image, scale=1 / 255) for image in images] images = [self.rescale(image=image, scale=rescale_factor) for image in images]
images = [self.normalize(x, mean=image_mean, std=image_std) for x in images] images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images] images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images]
images = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)["pixel_values"] images = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)["pixel_values"]

View File

@@ -141,8 +141,10 @@ class PaliGemmaProcessor(ProcessorMixin):
tokens_to_add = {"additional_special_tokens": [image_token]} tokens_to_add = {"additional_special_tokens": [image_token]}
tokenizer.add_special_tokens(tokens_to_add) tokenizer.add_special_tokens(tokens_to_add)
self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
self.image_token = IMAGE_TOKEN
else: else:
self.image_token_id = tokenizer.image_token_id self.image_token_id = tokenizer.image_token_id
self.image_token = tokenizer.image_token
tokenizer.add_tokens(EXTRA_TOKENS) tokenizer.add_tokens(EXTRA_TOKENS)
tokenizer.add_bos_token = False tokenizer.add_bos_token = False

View File

@@ -1086,7 +1086,6 @@ class ProcessorMixin(PushToHubMixin):
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs) processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
processor_dict.update({k: v for k, v in kwargs.items() if k in processor_dict.keys()})
return cls.from_args_and_dict(args, processor_dict, **kwargs) return cls.from_args_and_dict(args, processor_dict, **kwargs)
@classmethod @classmethod

View File

@@ -16,7 +16,6 @@ import shutil
import tempfile import tempfile
import unittest import unittest
from io import BytesIO from io import BytesIO
from typing import Optional
import numpy as np import numpy as np
import requests import requests
@@ -41,7 +40,7 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
cls.tmpdirname = tempfile.mkdtemp() cls.tmpdirname = tempfile.mkdtemp()
processor = AriaProcessor.from_pretrained("m-ric/Aria_hf_2", image_seq_len=2) processor = AriaProcessor.from_pretrained("m-ric/Aria_hf_2", size_conversion={490: 2, 980: 2})
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image1 = Image.open( cls.image1 = Image.open(
BytesIO( BytesIO(
@@ -74,7 +73,14 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token) cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token)
cls.global_img_tokens_id = processor.tokenizer(cls.global_img_token, add_special_tokens=False)["input_ids"] cls.global_img_tokens_id = processor.tokenizer(cls.global_img_token, add_special_tokens=False)["input_ids"]
cls.padding_token_id = processor.tokenizer.pad_token_id cls.padding_token_id = processor.tokenizer.pad_token_id
cls.image_seq_len = 256 cls.image_seq_len = 2
# Return the processor-level settings used by this test class: the Aria chat
# template and a `size_conversion` mapping.
# NOTE(review): presumably consumed by ProcessorTesterMixin when it builds
# processors for the shared tests — confirm against the mixin.
@staticmethod
def prepare_processor_dict():
return {
"chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}{% elif message['content'] is iterable %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<fim_prefix><|img|><fim_suffix>{% endif %}{% endfor %}{% endif %}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
# Same mapping that setUpClass passes to `AriaProcessor.from_pretrained`.
"size_conversion": {490: 2, 980: 2},
} # fmt: skip
def get_tokenizer(self, **kwargs): def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -89,24 +95,6 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def tearDownClass(cls): def tearDownClass(cls):
shutil.rmtree(cls.tmpdirname, ignore_errors=True) shutil.rmtree(cls.tmpdirname, ignore_errors=True)
def test_kwargs_overrides_default_image_processor_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
processor_components = self.prepare_components()
processor_components["image_processor"] = self.get_component(
"image_processor", do_rescale=True, rescale_factor=1
)
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
processor = self.processor_class(**processor_components)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
def test_process_interleaved_images_prompts_image_splitting(self): def test_process_interleaved_images_prompts_image_splitting(self):
processor = self.get_processor() processor = self.get_processor()
processor.image_processor.split_image = True processor.image_processor.split_image = True
@@ -236,155 +224,50 @@ And who is that?<|im_end|>
""" """
self.assertEqual(rendered, expected_rendered) self.assertEqual(rendered, expected_rendered)
# Override as AriaProcessor needs image tokens in prompts def test_image_chat_template_accepts_processing_kwargs(self):
def prepare_text_inputs(self, batch_size: Optional[int] = None): processor = self.get_processor()
if batch_size is None: if processor.chat_template is None:
return "lower newer <|img|>" self.skipTest("Processor has no chat template")
if batch_size < 1: messages = [
raise ValueError("batch_size must be greater than 0") [
{
"role": "user",
"content": [
{"type": "text", "text": "What is shown in this image?"},
],
},
]
]
if batch_size == 1: formatted_prompt_tokenized = processor.apply_chat_template(
return ["lower newer <|img|>"] messages,
return ["lower newer <|img|>", "<|img|> upper older longer string"] + ["<|img|> lower newer"] * ( add_generation_prompt=True,
batch_size - 2 tokenize=True,
)
# Override tests as inputs_ids padded dimension is the second one but not the last one
@require_vision
@require_torch
def test_kwargs_overrides_default_tokenizer_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer", max_length=30)
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=30)
self.assertEqual(len(inputs["input_ids"][0]), 30)
@require_torch
@require_vision
def test_structured_kwargs_nested(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
# Define the kwargs for each modality
inputs = processor(
text=input_str,
images=image_input,
common_kwargs={"return_tensors": "pt"},
images_kwargs={"max_image_size": 980},
text_kwargs={"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
)
self.skip_processor_without_typed_kwargs(processor)
self.assertEqual(inputs["pixel_values"].shape[3], 980)
self.assertEqual(len(inputs["input_ids"][0]), 120)
@require_torch
@require_vision
def test_structured_kwargs_nested_from_dict(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"images_kwargs": {"max_image_size": 980},
"text_kwargs": {"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
}
inputs = processor(text=input_str, images=image_input, **all_kwargs)
self.assertEqual(inputs["pixel_values"].shape[3], 980)
self.assertEqual(len(inputs["input_ids"][0]), 120)
@require_vision
@require_torch
def test_tokenizer_defaults_preserved_by_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer", max_length=30)
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertEqual(len(inputs["input_ids"][0]), 30)
@require_torch
@require_vision
def test_unstructured_kwargs_batched(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=2)
image_input = self.prepare_image_inputs(batch_size=2)
inputs = processor(
text=input_str,
images=image_input,
return_tensors="pt",
padding="longest",
max_length=76,
truncation=True,
max_image_size=980,
)
self.assertEqual(inputs["pixel_values"].shape[1], 3)
self.assertEqual(inputs["pixel_values"].shape[3], 980)
self.assertEqual(len(inputs["input_ids"][0]), 76)
@require_torch
@require_vision
def test_unstructured_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(
text=input_str,
images=image_input,
return_tensors="pt",
max_image_size=980,
padding="max_length", padding="max_length",
max_length=120, max_length=50,
truncation="longest_first",
) )
self.assertEqual(len(formatted_prompt_tokenized[0]), 50)
self.assertEqual(inputs["pixel_values"].shape[3], 980) formatted_prompt_tokenized = processor.apply_chat_template(
self.assertEqual(len(inputs["input_ids"][0]), 120) messages,
add_generation_prompt=True,
tokenize=True,
truncation=True,
max_length=5,
)
self.assertEqual(len(formatted_prompt_tokenized[0]), 5)
# Now test the ability to return dict
messages[0][0]["content"].append(
{"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
)
out_dict = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
max_image_size=980,
return_tensors="np",
)
self.assertListEqual(list(out_dict[self.images_input_name].shape), [1, 3, 980, 980])

View File

@@ -15,7 +15,6 @@
import shutil import shutil
import tempfile import tempfile
import unittest import unittest
from typing import Optional
from transformers import AutoProcessor, AutoTokenizer, AyaVisionProcessor from transformers import AutoProcessor, AutoTokenizer, AyaVisionProcessor
from transformers.testing_utils import require_read_token, require_torch, require_vision from transformers.testing_utils import require_read_token, require_torch, require_vision
@@ -61,6 +60,7 @@ class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
**processor_kwargs, **processor_kwargs,
) )
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
@staticmethod @staticmethod
def prepare_processor_dict(): def prepare_processor_dict():
@@ -79,20 +79,6 @@ class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def tearDownClass(cls): def tearDownClass(cls):
shutil.rmtree(cls.tmpdirname, ignore_errors=True) shutil.rmtree(cls.tmpdirname, ignore_errors=True)
# Override as AyaVisionProcessor needs image tokens in prompts
def prepare_text_inputs(self, batch_size: Optional[int] = None):
if batch_size is None:
return "lower newer <image>"
if batch_size < 1:
raise ValueError("batch_size must be greater than 0")
if batch_size == 1:
return ["lower newer <image>"]
return ["lower newer <image>", "<image> upper older longer string"] + ["<image> lower newer"] * (
batch_size - 2
)
@require_torch @require_torch
def test_process_interleaved_images_videos(self): def test_process_interleaved_images_videos(self):
processor = self.get_processor() processor = self.get_processor()

View File

@@ -40,5 +40,10 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase):
tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB) tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB)
tokenizer.pad_token_id = 0 tokenizer.pad_token_id = 0
tokenizer.sep_token_id = 1 tokenizer.sep_token_id = 1
processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer) processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer, image_seq_length=2)
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
@staticmethod
def prepare_processor_dict():
return {"image_seq_length": 2} # fmt: skip

View File

@@ -34,7 +34,7 @@ class Emu3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
cls.tmpdirname = tempfile.mkdtemp() cls.tmpdirname = tempfile.mkdtemp()
image_processor = Emu3ImageProcessor() image_processor = Emu3ImageProcessor(min_pixels=28 * 28, max_pixels=56 * 56)
extra_special_tokens = extra_special_tokens = { extra_special_tokens = extra_special_tokens = {
"image_token": "<image>", "image_token": "<image>",
"boi_token": "<|image start|>", "boi_token": "<|image start|>",
@@ -51,8 +51,10 @@ class Emu3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
image_processor=image_processor, tokenizer=tokenizer, chat_template="dummy_template" image_processor=image_processor, tokenizer=tokenizer, chat_template="dummy_template"
) )
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
def prepare_processor_dict(self): @staticmethod
def prepare_processor_dict():
return { return {
"chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}", "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
} # fmt: skip } # fmt: skip

View File

@@ -332,7 +332,7 @@ class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):
max_length=76, max_length=76,
) )
self.assertEqual(len(inputs["input_ids"][0]), 6) self.assertEqual(len(inputs["input_ids"][0]), 7)
@require_torch @require_torch

View File

@@ -56,6 +56,7 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_kwargs = cls.prepare_processor_dict() processor_kwargs = cls.prepare_processor_dict()
processor = Gemma3Processor(image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs) processor = Gemma3Processor(image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs)
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.boi_token
@classmethod @classmethod
def tearDownClass(cls): def tearDownClass(cls):
@@ -68,20 +69,6 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
"chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n", "image_seq_length": 3, "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ 
'<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n", "image_seq_length": 3,
} # fmt: skip } # fmt: skip
# Override as VLMs need image tokens in prompts
def prepare_text_inputs(self, batch_size: Optional[int] = None):
if batch_size is None:
return "lower newer <start_of_image>"
if batch_size < 1:
raise ValueError("batch_size must be greater than 0")
if batch_size == 1:
return ["lower newer <start_of_image>"]
return ["lower newer <start_of_image>", "<start_of_image> upper older longer string"] + [
"<start_of_image> lower newer"
] * (batch_size - 2)
# Override as Gemma3 needs images to be an explicitly nested batch # Override as Gemma3 needs images to be an explicitly nested batch
def prepare_image_inputs(self, batch_size: Optional[int] = None): def prepare_image_inputs(self, batch_size: Optional[int] = None):
"""This function prepares a list of PIL images for testing""" """This function prepares a list of PIL images for testing"""
@@ -123,7 +110,7 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_kwargs = self.prepare_processor_dict() processor_kwargs = self.prepare_processor_dict()
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="image")
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
inputs = processor( inputs = processor(
text=input_str, text=input_str,

View File

@@ -40,6 +40,7 @@ class GotOcr2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_kwargs = {} processor_kwargs = {}
processor = GotOcr2Processor(image_processor, tokenizer, **processor_kwargs) processor = GotOcr2Processor(image_processor, tokenizer, **processor_kwargs)
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.img_pad_token
def get_tokenizer(self, **kwargs): def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer

View File

@@ -79,7 +79,7 @@ class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
cls.embed_dim = 5 cls.embed_dim = 5
cls.seq_length = 5 cls.seq_length = 5
def prepare_text_inputs(self, batch_size: Optional[int] = None): def prepare_text_inputs(self, batch_size: Optional[int] = None, modality: Optional[str] = None):
labels = ["a cat", "remote control"] labels = ["a cat", "remote control"]
labels_longer = ["a person", "a car", "a dog", "a cat"] labels_longer = ["a person", "a car", "a dog", "a cat"]

View File

@@ -219,139 +219,3 @@ class IdeficsProcessorTest(ProcessorTesterMixin, unittest.TestCase):
# For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask'] # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask']
self.assertSetEqual(set(inputs.keys()), set(self.input_keys)) self.assertSetEqual(set(inputs.keys()), set(self.input_keys))
# Override the following tests as Idefics image processor does not accept do_rescale and rescale_factor
@require_torch
@require_vision
def test_image_processor_defaults_preserved_by_image_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor", image_size=234)
tokenizer = self.get_component("tokenizer", max_length=117)
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input)
self.assertEqual(len(inputs["pixel_values"][0][0][0]), 234)
@require_torch
@require_vision
def test_kwargs_overrides_default_image_processor_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor", image_size=234)
tokenizer = self.get_component("tokenizer", max_length=117)
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, image_size=224)
self.assertEqual(len(inputs["pixel_values"][0][0][0]), 224)
@require_torch
@require_vision
def test_unstructured_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(
text=input_str,
images=image_input,
return_tensors="pt",
image_size=214,
padding="max_length",
max_length=76,
)
self.assertEqual(inputs["pixel_values"].shape[3], 214)
self.assertEqual(len(inputs["input_ids"][0]), 76)
@require_torch
@require_vision
def test_unstructured_kwargs_batched(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=2)
image_input = self.prepare_image_inputs(batch_size=2)
inputs = processor(
text=input_str,
images=image_input,
return_tensors="pt",
image_size=214,
padding="longest",
max_length=76,
)
self.assertEqual(inputs["pixel_values"].shape[3], 214)
self.assertEqual(len(inputs["input_ids"][0]), 8)
@require_torch
@require_vision
def test_structured_kwargs_nested(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"images_kwargs": {"image_size": 214},
"text_kwargs": {"padding": "max_length", "max_length": 76},
}
inputs = processor(text=input_str, images=image_input, **all_kwargs)
self.skip_processor_without_typed_kwargs(processor)
self.assertEqual(inputs["pixel_values"].shape[3], 214)
self.assertEqual(len(inputs["input_ids"][0]), 76)
@require_torch
@require_vision
def test_structured_kwargs_nested_from_dict(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"images_kwargs": {"image_size": 214},
"text_kwargs": {"padding": "max_length", "max_length": 76},
}
inputs = processor(text=input_str, images=image_input, **all_kwargs)
self.assertEqual(inputs["pixel_values"].shape[3], 214)
self.assertEqual(len(inputs["input_ids"][0]), 76)

View File

@@ -16,7 +16,6 @@ import shutil
import tempfile import tempfile
import unittest import unittest
from io import BytesIO from io import BytesIO
from typing import Optional
import requests import requests
@@ -84,6 +83,10 @@ class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def get_processor(self, **kwargs): def get_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
@staticmethod
def prepare_processor_dict():
return {"image_seq_len": 2}
@classmethod @classmethod
def tearDownClass(cls): def tearDownClass(cls):
shutil.rmtree(cls.tmpdirname, ignore_errors=True) shutil.rmtree(cls.tmpdirname, ignore_errors=True)
@@ -329,17 +332,3 @@ class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
"Assistant:" "Assistant:"
) )
self.assertEqual(rendered, expected_rendered) self.assertEqual(rendered, expected_rendered)
# Override as Idefics2Processor needs image tokens in prompts
def prepare_text_inputs(self, batch_size: Optional[int] = None):
if batch_size is None:
return "lower newer <image>"
if batch_size < 1:
raise ValueError("batch_size must be greater than 0")
if batch_size == 1:
return ["lower newer <image>"]
return ["lower newer <image>", "<image> upper older longer string"] + ["<image> lower newer"] * (
batch_size - 2
)

View File

@@ -16,7 +16,6 @@ import shutil
import tempfile import tempfile
import unittest import unittest
from io import BytesIO from io import BytesIO
from typing import Optional
import numpy as np import numpy as np
import requests import requests
@@ -81,6 +80,10 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def get_processor(self, **kwargs): def get_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
@staticmethod
def prepare_processor_dict():
return {"image_seq_len": 2}
def get_split_image_expected_tokens(self, processor, image_rows, image_cols): def get_split_image_expected_tokens(self, processor, image_rows, image_cols):
text_split_images = [] text_split_images = []
for n_h in range(image_rows): for n_h in range(image_rows):
@@ -352,159 +355,6 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
) )
self.assertEqual(rendered, expected_rendered) self.assertEqual(rendered, expected_rendered)
# Override as Idefics3Processor needs image tokens in prompts
def prepare_text_inputs(self, batch_size: Optional[int] = None):
if batch_size is None:
return "lower newer <image>"
if batch_size < 1:
raise ValueError("batch_size must be greater than 0")
if batch_size == 1:
return ["lower newer <image>"]
return ["lower newer <image>", "<image> upper older longer string"] + ["<image> lower newer"] * (
batch_size - 2
)
# Override tests as inputs_ids padded dimension is the second one but not the last one
@require_vision
@require_torch
def test_kwargs_overrides_default_tokenizer_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer", max_length=30)
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=30)
self.assertEqual(len(inputs["input_ids"][0]), 30)
@require_torch
@require_vision
def test_structured_kwargs_nested(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
# Define the kwargs for each modality
inputs = processor(
text=input_str,
images=image_input,
common_kwargs={"return_tensors": "pt"},
images_kwargs={"max_image_size": {"longest_edge": 32}},
text_kwargs={"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
)
self.skip_processor_without_typed_kwargs(processor)
self.assertEqual(inputs["pixel_values"].shape[3], 32)
self.assertEqual(len(inputs["input_ids"][0]), 120)
@require_torch
@require_vision
def test_structured_kwargs_nested_from_dict(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"images_kwargs": {"max_image_size": {"longest_edge": 32}},
"text_kwargs": {"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
}
inputs = processor(text=input_str, images=image_input, **all_kwargs)
self.assertEqual(inputs["pixel_values"].shape[3], 32)
self.assertEqual(len(inputs["input_ids"][0]), 120)
@require_vision
@require_torch
def test_tokenizer_defaults_preserved_by_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer", max_length=30)
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertEqual(len(inputs["input_ids"][0]), 30)
@require_torch
@require_vision
def test_unstructured_kwargs_batched(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=2)
image_input = self.prepare_image_inputs(batch_size=2)
inputs = processor(
text=input_str,
images=image_input,
return_tensors="pt",
padding="longest",
max_length=76,
truncation=True,
max_image_size={"longest_edge": 30},
)
self.assertEqual(inputs["pixel_values"].shape[2], 3)
self.assertEqual(inputs["pixel_values"].shape[3], 30)
self.assertEqual(len(inputs["input_ids"][0]), 76)
@require_torch
@require_vision
def test_unstructured_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(
text=input_str,
images=image_input,
return_tensors="pt",
max_image_size={"longest_edge": 32},
padding="max_length",
max_length=120,
truncation="longest_first",
)
self.assertEqual(inputs["pixel_values"].shape[3], 32)
self.assertEqual(len(inputs["input_ids"][0]), 120)
@require_torch @require_torch
@require_vision @require_vision
def test_text_only_inference(self): def test_text_only_inference(self):

View File

@@ -15,7 +15,6 @@
import shutil import shutil
import tempfile import tempfile
import unittest import unittest
from typing import Optional
from transformers import AutoProcessor, Llama4Processor, PreTrainedTokenizerFast from transformers import AutoProcessor, Llama4Processor, PreTrainedTokenizerFast
from transformers.testing_utils import require_vision from transformers.testing_utils import require_vision
@@ -38,9 +37,10 @@ class Llama4ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
image_processor = Llama4ImageProcessorFast(max_patches=1, size={"height": 20, "width": 20}) image_processor = Llama4ImageProcessorFast(max_patches=1, size={"height": 20, "width": 20})
tokenizer = PreTrainedTokenizerFast.from_pretrained("unsloth/Llama-3.2-11B-Vision-Instruct-unsloth-bnb-4bit") tokenizer = PreTrainedTokenizerFast.from_pretrained("unsloth/Llama-3.2-11B-Vision-Instruct-unsloth-bnb-4bit")
processor_kwargs = {} processor_kwargs = cls.prepare_processor_dict()
processor = Llama4Processor(image_processor, tokenizer, **processor_kwargs) processor = Llama4Processor(image_processor, tokenizer, **processor_kwargs)
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
def get_tokenizer(self, **kwargs): def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -51,21 +51,3 @@ class Llama4ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
@classmethod @classmethod
def tearDownClass(cls): def tearDownClass(cls):
shutil.rmtree(cls.tmpdirname) shutil.rmtree(cls.tmpdirname)
# Override as Llama4Processor needs image tokens in prompts
def prepare_text_inputs(self, batch_size: Optional[int] = None):
if batch_size is None:
return "lower newer <|image|>"
if batch_size < 1:
raise ValueError("batch_size must be greater than 0")
if batch_size == 1:
return ["lower newer <|image|>"]
return ["lower newer <|image|>", "<|image|> upper older longer string"] + ["<|image|> lower newer"] * (
batch_size - 2
)
@unittest.skip("This test uses return_tensors='np' which is not supported")
def test_image_chat_template_accepts_processing_kwargs(self):
pass

View File

@@ -43,6 +43,7 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_kwargs = cls.prepare_processor_dict() processor_kwargs = cls.prepare_processor_dict()
processor = LlavaProcessor(image_processor, tokenizer, **processor_kwargs) processor = LlavaProcessor(image_processor, tokenizer, **processor_kwargs)
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
def get_tokenizer(self, **kwargs): def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -58,18 +59,10 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def prepare_processor_dict(): def prepare_processor_dict():
return { return {
"chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}", "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
"patch_size": 3, "patch_size": 128,
"vision_feature_select_strategy": "default" "vision_feature_select_strategy": "default"
} # fmt: skip } # fmt: skip
@unittest.skip(
"Skip because the model has no processor kwargs except for chat template and"
"chat template is saved as a separate file. Stop skipping this test when the processor"
"has new kwargs saved in config file."
)
def test_processor_to_json_string(self):
pass
def test_chat_template_is_saved(self): def test_chat_template_is_saved(self):
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname) processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
processor_dict_loaded = json.loads(processor_loaded.to_json_string()) processor_dict_loaded = json.loads(processor_loaded.to_json_string())

View File

@@ -43,6 +43,7 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_kwargs = cls.prepare_processor_dict() processor_kwargs = cls.prepare_processor_dict()
processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs) processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs)
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
def get_tokenizer(self, **kwargs): def get_tokenizer(self, **kwargs):
return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -54,18 +55,10 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def prepare_processor_dict(): def prepare_processor_dict():
return { return {
"chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}", "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
"patch_size": 3, "patch_size": 128,
"vision_feature_select_strategy": "default" "vision_feature_select_strategy": "default"
} # fmt: skip } # fmt: skip
@unittest.skip(
"Skip because the model has no processor kwargs except for chat template and"
"chat template is saved as a separate file. Stop skipping this test when the processor"
"has new kwargs saved in config file."
)
def test_processor_to_json_string(self):
pass
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
def test_chat_template_is_saved(self): def test_chat_template_is_saved(self):
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname) processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)

View File

@@ -47,6 +47,8 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
video_processor=video_processor, image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs video_processor=video_processor, image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs
) )
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
cls.video_token = processor.video_token
def get_tokenizer(self, **kwargs): def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -61,20 +63,11 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def prepare_processor_dict(cls): def prepare_processor_dict(cls):
return { return {
"chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + ' '}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>' }}{% endfor %}{# Render all video then #}{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '<video>' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ '\n' + content['text'] }}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ '\n' + content['text'] }}{% endgeneration %}{% endfor %}{% endif %}{{'<|im_end|>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + ' '}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>' }}{% endfor %}{# Render all video then #}{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '<video>' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ '\n' + content['text'] }}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ '\n' + content['text'] }}{% endgeneration %}{% endfor %}{% endif %}{{'<|im_end|>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
"num_additional_image_tokens": 6, "num_additional_image_tokens": 0,
"patch_size": 4, "patch_size": 128,
"vision_feature_select_strategy": "default", "vision_feature_select_strategy": "default",
} }
def test_processor_to_json_string(self):
processor = self.get_processor()
obj = json.loads(processor.to_json_string())
for key, value in self.prepare_processor_dict().items():
# chat_tempalate are tested as a separate test because they are saved in separate files
if key != "chat_template":
self.assertEqual(obj[key], value)
self.assertEqual(getattr(processor, key, None), value)
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
def test_chat_template_is_saved(self): def test_chat_template_is_saved(self):
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname) processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)

View File

@@ -51,6 +51,8 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
video_processor=video_processor, image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs video_processor=video_processor, image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs
) )
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
cls.video_token = processor.video_token
def get_tokenizer(self, **kwargs): def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -73,15 +75,6 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
"vision_feature_select_strategy": "default" "vision_feature_select_strategy": "default"
} # fmt: skip } # fmt: skip
def test_processor_to_json_string(self):
processor = self.get_processor()
obj = json.loads(processor.to_json_string())
for key, value in self.prepare_processor_dict().items():
# chat_tempalate are tested as a separate test because they are saved in separate files
if key != "chat_template":
self.assertEqual(obj[key], value)
self.assertEqual(getattr(processor, key, None), value)
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
def test_chat_template_is_saved(self): def test_chat_template_is_saved(self):
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname) processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)

View File

@@ -19,7 +19,7 @@ import unittest
import requests import requests
from transformers import PixtralProcessor from transformers import PixtralProcessor
from transformers.testing_utils import require_read_token, require_vision from transformers.testing_utils import require_vision
from transformers.utils import is_torch_available, is_vision_available from transformers.utils import is_torch_available, is_vision_available
from ...test_processing_common import ProcessorTesterMixin from ...test_processing_common import ProcessorTesterMixin
@@ -34,7 +34,6 @@ if is_vision_available():
@require_vision @require_vision
@require_read_token
class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
"""This tests Pixtral processor with the new `spatial_merge_size` argument in Mistral3.""" """This tests Pixtral processor with the new `spatial_merge_size` argument in Mistral3."""
@@ -49,30 +48,37 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
cls.url_2 = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg" cls.url_2 = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
cls.image_2 = Image.open(requests.get(cls.url_2, stream=True).raw) cls.image_2 = Image.open(requests.get(cls.url_2, stream=True).raw)
def setUp(self): cls.tmpdirname = tempfile.mkdtemp()
self.tmpdirname = tempfile.mkdtemp() cls.addClassCleanup(lambda tempdir=cls.tmpdirname: shutil.rmtree(tempdir))
processor_kwargs = cls.prepare_processor_dict()
processor = PixtralProcessor.from_pretrained( processor = PixtralProcessor.from_pretrained(
"hf-internal-testing/Mistral-Small-3.1-24B-Instruct-2503-only-processor" "hf-internal-testing/Mistral-Small-3.1-24B-Instruct-2503-only-processor", **processor_kwargs
) )
processor.save_pretrained(self.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
def get_processor(self): def get_processor(self):
return self.processor_class.from_pretrained(self.tmpdirname) return self.processor_class.from_pretrained(self.tmpdirname)
def tearDown(self): @staticmethod
shutil.rmtree(self.tmpdirname) def prepare_processor_dict():
return {
"chat_template": "{%- set today = strftime_now(\"%Y-%m-%d\") %}\n{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\\nYour knowledge base was last updated on 2023-10-01. The current date is \" + today + \".\\n\\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \\\"What are some good restaurants around me?\\\" => \\\"Where are you?\\\" or \\\"When is the next flight to Tokyo\\\" => \\\"Where do you travel from?\\\")\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n {%- if messages[0] is string %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n {%- else %} \n {%- set system_message = messages[0]['content'][0]['text'] %}\n {%- set loop_messages = messages[1:] %}\n {%- endif %}\n{%- else %}\n {%- set system_message = default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n {%- if message['content'] is string %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n {%- else %}\n {{- '[INST]' }}\n {%- for block in message['content'] %}\n {%- if block['type'] == 'text' %}\n {{- block['text'] }}\n {%- elif block['type'] == 'image' or block['type'] == 'image_url' %}\n {{- '[IMG]' }}\n {%- else %}\n {{- raise_exception('Only text and image blocks are supported in message content!') }}\n {%- endif %}\n {%- endfor %}\n {{- '[/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'system' %}\n {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' 
}}\n {%- elif message['role'] == 'assistant' %}\n {%- if message['content'] is string %}\n {{- message['content'] + eos_token }}\n {%- else %}\n {{- message['content'][0]['text'] + eos_token }}\n {%- endif %}\n {%- else %}\n {{- raise_exception('Only user, system and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}",
"patch_size": 128,
} # fmt: skip
def test_image_token_filling(self): def test_image_token_filling(self):
processor = self.processor_class.from_pretrained(self.tmpdirname) processor = self.processor_class.from_pretrained(self.tmpdirname)
# Important to check with non square image # Important to check with non square image
image = torch.randint(0, 2, (3, 500, 316)) image = torch.randint(0, 2, (3, 500, 316))
expected_image_tokens = 198 expected_image_tokens = 4
image_token_index = 10 image_token_index = 10
messages = [ messages = [
{ {
"role": "system", "role": "system",
"content": "", "content": [{"type": "text", "text": "You are a helpful assistant."}],
}, },
{ {
"role": "user", "role": "user",
@@ -104,14 +110,14 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertTrue(len(inputs_image["input_ids"]) == 1) self.assertTrue(len(inputs_image["input_ids"]) == 1)
self.assertIsInstance(inputs_image["input_ids"], torch.Tensor) self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor) self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 30])) self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 36]))
# fmt: off # fmt: off
input_ids = inputs_image["input_ids"] input_ids = inputs_image["input_ids"]
self.assertEqual( self.assertEqual(
input_ids[0].tolist(), input_ids[0].tolist(),
# Equivalent to "USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:" # Equivalent to "USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:"
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058] [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
) )
# fmt: on # fmt: on
@@ -121,36 +127,36 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertTrue(len(inputs_url["input_ids"]) == 1) self.assertTrue(len(inputs_url["input_ids"]) == 1)
self.assertIsInstance(inputs_url["input_ids"], torch.Tensor) self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor) self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 30])) self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 36]))
# fmt: off # fmt: off
input_ids = inputs_url["input_ids"] input_ids = inputs_url["input_ids"]
self.assertEqual( self.assertEqual(
input_ids[0].tolist(), input_ids[0].tolist(),
# Equivalent to "USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:" # Equivalent to "USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:"
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058] [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
) )
# fmt: on # fmt: on
# Test passing inputs as a single list # Test passing inputs as a single list
inputs_image = processor(text=prompt_string, images=[self.image_0], return_tensors="pt") inputs_image = processor(text=prompt_string, images=[self.image_0], return_tensors="pt")
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 30])) self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 36]))
# fmt: off # fmt: off
self.assertEqual( self.assertEqual(
inputs_image["input_ids"][0].tolist(), inputs_image["input_ids"][0].tolist(),
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058] [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
) )
# fmt: on # fmt: on
# Test as nested single list # Test as nested single list
inputs_image = processor(text=prompt_string, images=[[self.image_0]], return_tensors="pt") inputs_image = processor(text=prompt_string, images=[[self.image_0]], return_tensors="pt")
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 30])) self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 36]))
# fmt: off # fmt: off
self.assertEqual( self.assertEqual(
inputs_image["input_ids"][0].tolist(), inputs_image["input_ids"][0].tolist(),
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058] [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
) )
# fmt: on # fmt: on
@@ -168,14 +174,14 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertTrue(len(inputs_image["input_ids"]) == 1) self.assertTrue(len(inputs_image["input_ids"]) == 1)
self.assertIsInstance(inputs_image["input_ids"], torch.Tensor) self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor) self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 30])) self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 36]))
# fmt: off # fmt: off
input_ids = inputs_image["input_ids"] input_ids = inputs_image["input_ids"]
self.assertEqual( self.assertEqual(
input_ids[0].tolist(), input_ids[0].tolist(),
# Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"] # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
) )
# fmt: on # fmt: on
@@ -185,25 +191,25 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertTrue(len(inputs_url["input_ids"]) == 1) self.assertTrue(len(inputs_url["input_ids"]) == 1)
self.assertIsInstance(inputs_url["input_ids"], torch.Tensor) self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor) self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 30])) self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 36]))
# fmt: off # fmt: off
input_ids = inputs_url["input_ids"] input_ids = inputs_url["input_ids"]
self.assertEqual( self.assertEqual(
input_ids[0].tolist(), input_ids[0].tolist(),
# Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"] # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
) )
# fmt: on # fmt: on
# Test passing in as a nested list # Test passing in as a nested list
inputs_url = processor(text=prompt_string, images=[[self.image_0, self.image_1]], return_tensors="pt") inputs_url = processor(text=prompt_string, images=[[self.image_0, self.image_1]], return_tensors="pt")
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 30])) self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 36]))
# fmt: off # fmt: off
self.assertEqual( self.assertEqual(
inputs_url["input_ids"][0].tolist(), inputs_url["input_ids"][0].tolist(),
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
) )
# fmt: on # fmt: on
@@ -226,14 +232,14 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertTrue(len(inputs_image["input_ids"]) == 2) self.assertTrue(len(inputs_image["input_ids"]) == 2)
self.assertIsInstance(inputs_image["input_ids"], torch.Tensor) self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor) self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 30, 30])) self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 36, 36]))
# fmt: off # fmt: off
input_ids = inputs_image["input_ids"] input_ids = inputs_image["input_ids"]
self.assertEqual( self.assertEqual(
input_ids[0].tolist(), input_ids[0].tolist(),
# Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"] # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
) )
# fmt: on # fmt: on
@@ -243,14 +249,14 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertTrue(len(inputs_url["input_ids"]) == 2) self.assertTrue(len(inputs_url["input_ids"]) == 2)
self.assertIsInstance(inputs_url["input_ids"], torch.Tensor) self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor) self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 30, 30])) self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 36, 36]))
# fmt: off # fmt: off
input_ids = inputs_url["input_ids"] input_ids = inputs_url["input_ids"]
self.assertEqual( self.assertEqual(
input_ids[0].tolist(), input_ids[0].tolist(),
# Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"] # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
) )
# fmt: on # fmt: on
@@ -258,12 +264,12 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
inputs_image = processor( inputs_image = processor(
text=prompt_string, images=[self.image_0, self.image_1, self.image_2], return_tensors="pt", padding=True text=prompt_string, images=[self.image_0, self.image_1, self.image_2], return_tensors="pt", padding=True
) )
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 30, 30])) self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 36, 36]))
# fmt: off # fmt: off
self.assertEqual( self.assertEqual(
inputs_image["input_ids"][0].tolist(), inputs_image["input_ids"][0].tolist(),
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
) )
# fmt: on # fmt: on

View File

@@ -16,7 +16,6 @@ import json
import shutil import shutil
import tempfile import tempfile
import unittest import unittest
from typing import Optional
import numpy as np import numpy as np
@@ -333,20 +332,6 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
processor(text=text, images=None, padding=True) processor(text=text, images=None, padding=True)
# Override as MllamaProcessor needs image tokens in prompts
def prepare_text_inputs(self, batch_size: Optional[int] = None):
if batch_size is None:
return "lower newer <|image|>"
if batch_size < 1:
raise ValueError("batch_size must be greater than 0")
if batch_size == 1:
return ["lower newer <|image|>"]
return ["lower newer <|image|>", "<|image|> upper older longer string"] + ["<|image|> lower newer"] * (
batch_size - 2
)
def test_unstructured_kwargs_batched(self): def test_unstructured_kwargs_batched(self):
# Overriden because Mllama expects images in nested format. For 2 images it can't infer # Overriden because Mllama expects images in nested format. For 2 images it can't infer
# the correct nesting, so we better throw an error # the correct nesting, so we better throw an error
@@ -357,7 +342,7 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=2) input_str = self.prepare_text_inputs(batch_size=2, modality="image")
image_input = self.prepare_image_inputs(batch_size=2) image_input = self.prepare_image_inputs(batch_size=2)
image_input = [[image_input[0]], [image_input[1]]] image_input = [[image_input[0]], [image_input[1]]]
inputs = processor( inputs = processor(

View File

@@ -37,10 +37,11 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def setUpClass(cls): def setUpClass(cls):
cls.tmpdirname = tempfile.mkdtemp() cls.tmpdirname = tempfile.mkdtemp()
image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384") image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
image_processor.image_seq_length = 0 image_processor.image_seq_length = 0 # TODO: raushan fix me in #37342
tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer) processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer)
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
@classmethod @classmethod
def tearDownClass(cls): def tearDownClass(cls):

View File

@@ -43,8 +43,11 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
cls.tmpdirname = tempfile.mkdtemp() cls.tmpdirname = tempfile.mkdtemp()
processor = Qwen2_5_VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", patch_size=4) processor = Qwen2_5_VLProcessor.from_pretrained(
"Qwen/Qwen2-VL-7B-Instruct", patch_size=4, max_pixels=56 * 56, min_pixels=28 * 28
)
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
def get_tokenizer(self, **kwargs): def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -52,8 +55,11 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def get_image_processor(self, **kwargs): def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
def prepare_processor_dict(self): @staticmethod
return {"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"} # fmt: skip def prepare_processor_dict():
return {
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
} # fmt: skip
@classmethod @classmethod
def tearDownClass(cls): def tearDownClass(cls):
@@ -206,7 +212,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertTrue(input_name in out_dict) self.assertTrue(input_name in out_dict)
self.assertEqual(len(out_dict["input_ids"]), batch_size) self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size) self.assertEqual(len(out_dict["attention_mask"]), batch_size)
self.assertEqual(len(out_dict[input_name]), batch_size * 19200) self.assertEqual(len(out_dict[input_name]), batch_size * 192)
return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list} return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
for k in out_dict: for k in out_dict:
@@ -261,7 +267,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
num_frames=num_frames, num_frames=num_frames,
) )
self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 115200) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)
# Load with `video_fps` arg # Load with `video_fps` arg
video_fps = 1 video_fps = 1
@@ -273,7 +279,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
video_fps=video_fps, video_fps=video_fps,
) )
self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 288000) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 900)
# Load with `video_fps` and `num_frames` args, should raise an error # Load with `video_fps` and `num_frames` args, should raise an error
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
@@ -294,7 +300,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
return_dict=True, return_dict=True,
) )
self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8640000) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 27000)
# Load video as a list of frames (i.e. images). NOTE: each frame should have same size # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
# because we assume they come from one video # because we assume they come from one video
@@ -312,7 +318,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
return_dict=True, return_dict=True,
) )
self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 160)
def test_kwargs_overrides_custom_image_processor_kwargs(self): def test_kwargs_overrides_custom_image_processor_kwargs(self):
processor_components = self.prepare_components() processor_components = self.prepare_components()
@@ -328,7 +334,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt") inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
self.assertEqual(inputs[self.images_input_name].shape[0], 612) self.assertEqual(inputs[self.images_input_name].shape[0], 612)
inputs = processor(text=input_str, images=image_input, return_tensors="pt") inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertEqual(inputs[self.images_input_name].shape[0], 800) self.assertEqual(inputs[self.images_input_name].shape[0], 100)
@require_av @require_av
def test_apply_chat_template_video_special_processing(self): def test_apply_chat_template_video_special_processing(self):
@@ -395,4 +401,4 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc # Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0] formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text) self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1756800) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)

View File

@@ -14,7 +14,6 @@
import shutil import shutil
import tempfile import tempfile
import unittest import unittest
from typing import Optional
from transformers import AutoProcessor, AutoTokenizer, Qwen2AudioProcessor, WhisperFeatureExtractor from transformers import AutoProcessor, AutoTokenizer, Qwen2AudioProcessor, WhisperFeatureExtractor
from transformers.testing_utils import require_torch, require_torchaudio from transformers.testing_utils import require_torch, require_torchaudio
@@ -40,6 +39,7 @@ class Qwen2AudioProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_kwargs = cls.prepare_processor_dict() processor_kwargs = cls.prepare_processor_dict()
processor = Qwen2AudioProcessor.from_pretrained(cls.checkpoint, **processor_kwargs) processor = Qwen2AudioProcessor.from_pretrained(cls.checkpoint, **processor_kwargs)
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.audio_token = processor.audio_token
def get_tokenizer(self, **kwargs): def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -57,20 +57,6 @@ class Qwen2AudioProcessorTest(ProcessorTesterMixin, unittest.TestCase):
"chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or content['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}", "chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or content['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
} }
# Override as Qwen2AudioProcessor needs audio tokens in prompts
def prepare_text_inputs(self, batch_size: Optional[int] = None):
if batch_size is None:
return "lower newer <|AUDIO|>"
if batch_size < 1:
raise ValueError("batch_size must be greater than 0")
if batch_size == 1:
return ["lower newer <|AUDIO|>"]
return ["lower newer <|AUDIO|>", "<|AUDIO|> upper older longer string"] + ["<|AUDIO|> lower newer"] * (
batch_size - 2
)
def test_can_load_various_tokenizers(self): def test_can_load_various_tokenizers(self):
processor = Qwen2AudioProcessor.from_pretrained(self.checkpoint) processor = Qwen2AudioProcessor.from_pretrained(self.checkpoint)
tokenizer = AutoTokenizer.from_pretrained(self.checkpoint) tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)

View File

@@ -43,8 +43,11 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
cls.tmpdirname = tempfile.mkdtemp() cls.tmpdirname = tempfile.mkdtemp()
processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", patch_size=4) processor = Qwen2VLProcessor.from_pretrained(
"Qwen/Qwen2-VL-7B-Instruct", patch_size=4, max_pixels=56 * 56, min_pixels=28 * 28
)
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
def get_tokenizer(self, **kwargs): def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -52,7 +55,8 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def get_image_processor(self, **kwargs): def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
def prepare_processor_dict(self): @staticmethod
def prepare_processor_dict():
return {"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"} # fmt: skip return {"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif 
%}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"} # fmt: skip
@classmethod @classmethod
@@ -203,7 +207,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertTrue(input_name in out_dict) self.assertTrue(input_name in out_dict)
self.assertEqual(len(out_dict["input_ids"]), batch_size) self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size) self.assertEqual(len(out_dict["attention_mask"]), batch_size)
self.assertEqual(len(out_dict[input_name]), batch_size * 19200) self.assertEqual(len(out_dict[input_name]), batch_size * 192)
return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list} return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
for k in out_dict: for k in out_dict:
@@ -258,7 +262,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
num_frames=num_frames, num_frames=num_frames,
) )
self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 115200) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)
# Load with `video_fps` arg # Load with `video_fps` arg
video_fps = 1 video_fps = 1
@@ -270,7 +274,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
video_fps=video_fps, video_fps=video_fps,
) )
self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 288000) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 900)
# Load with `video_fps` and `num_frames` args, should raise an error # Load with `video_fps` and `num_frames` args, should raise an error
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
@@ -291,7 +295,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
return_dict=True, return_dict=True,
) )
self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8640000) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 27000)
# Load video as a list of frames (i.e. images). NOTE: each frame should have same size # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
# because we assume they come from one video # because we assume they come from one video
@@ -309,7 +313,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
return_dict=True, return_dict=True,
) )
self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 160)
@require_av @require_av
def test_apply_chat_template_video_special_processing(self): def test_apply_chat_template_video_special_processing(self):
@@ -376,7 +380,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc # Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0] formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text) self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1756800) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)
def test_kwargs_overrides_custom_image_processor_kwargs(self): def test_kwargs_overrides_custom_image_processor_kwargs(self):
processor_components = self.prepare_components() processor_components = self.prepare_components()
@@ -390,6 +394,6 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, return_tensors="pt") inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertEqual(inputs[self.images_input_name].shape[0], 800) self.assertEqual(inputs[self.images_input_name].shape[0], 100)
inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt") inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
self.assertEqual(inputs[self.images_input_name].shape[0], 612) self.assertEqual(inputs[self.images_input_name].shape[0], 612)

View File

@@ -16,7 +16,6 @@ import shutil
import tempfile import tempfile
import unittest import unittest
from io import BytesIO from io import BytesIO
from typing import Optional
import numpy as np import numpy as np
import requests import requests
@@ -42,7 +41,8 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
cls.tmpdirname = tempfile.mkdtemp() cls.tmpdirname = tempfile.mkdtemp()
processor = SmolVLMProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct", image_seq_len=2) processor_kwargs = cls.prepare_processor_dict()
processor = SmolVLMProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct", **processor_kwargs)
processor.save_pretrained(cls.tmpdirname) processor.save_pretrained(cls.tmpdirname)
cls.image1 = Image.open( cls.image1 = Image.open(
BytesIO( BytesIO(
@@ -82,9 +82,10 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def get_processor(self, **kwargs): def get_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
def prepare_processor_dict(self): @staticmethod
def prepare_processor_dict():
return { return {
"image_seq_len": self.image_seq_len, "image_seq_len": 2,
"chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", "chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
} }
@@ -426,106 +427,6 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
# NOTE: the last assert checks are removed # NOTE: the last assert checks are removed
# Loading video as a list of frames (i.e. images) is not supported in SmolVLM # Loading video as a list of frames (i.e. images) is not supported in SmolVLM
# Override as SmolVLMProcessor needs image tokens in prompts
def prepare_text_inputs(self, batch_size: Optional[int] = None):
if batch_size is None:
return "lower newer <image>"
if batch_size < 1:
raise ValueError("batch_size must be greater than 0")
if batch_size == 1:
return ["lower newer <image>"]
return ["lower newer <image>", "<image> upper older longer string"] + ["<image> lower newer"] * (
batch_size - 2
)
# Override tests as inputs_ids padded dimension is the second one but not the last one
@require_vision
@require_torch
def test_kwargs_overrides_default_tokenizer_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer", max_length=30)
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=30)
self.assertEqual(len(inputs["input_ids"][0]), 30)
@require_torch
@require_vision
def test_structured_kwargs_nested(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
# Define the kwargs for each modality
inputs = processor(
text=input_str,
images=image_input,
common_kwargs={"return_tensors": "pt"},
images_kwargs={"max_image_size": {"longest_edge": 32}},
text_kwargs={"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
)
self.skip_processor_without_typed_kwargs(processor)
self.assertEqual(inputs["pixel_values"].shape[3], 32)
self.assertEqual(len(inputs["input_ids"][0]), 120)
@require_torch
@require_vision
def test_structured_kwargs_nested_from_dict(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"images_kwargs": {"max_image_size": {"longest_edge": 32}},
"text_kwargs": {"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
}
inputs = processor(text=input_str, images=image_input, **all_kwargs)
self.assertEqual(inputs["pixel_values"].shape[3], 32)
self.assertEqual(len(inputs["input_ids"][0]), 120)
@require_vision
@require_torch
def test_tokenizer_defaults_preserved_by_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer", max_length=30)
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertEqual(len(inputs["input_ids"][0]), 30)
@require_torch @require_torch
@require_vision @require_vision
def test_unstructured_kwargs_batched(self): def test_unstructured_kwargs_batched(self):
@@ -537,7 +438,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=2) input_str = self.prepare_text_inputs(batch_size=2, modality="image")
image_input = self.prepare_image_inputs(batch_size=2) image_input = self.prepare_image_inputs(batch_size=2)
image_input = [[image_input[0]], [image_input[1]]] image_input = [[image_input[0]], [image_input[1]]]
inputs = processor( inputs = processor(
@@ -554,32 +455,6 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertEqual(inputs["pixel_values"].shape[3], 30) self.assertEqual(inputs["pixel_values"].shape[3], 30)
self.assertEqual(len(inputs["input_ids"][0]), 76) self.assertEqual(len(inputs["input_ids"][0]), 76)
@require_torch
@require_vision
def test_unstructured_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(
text=input_str,
images=image_input,
return_tensors="pt",
max_image_size={"longest_edge": 32},
padding="max_length",
max_length=120,
truncation="longest_first",
)
self.assertEqual(inputs["pixel_values"].shape[3], 32)
self.assertEqual(len(inputs["input_ids"][0]), 120)
@require_torch @require_torch
@require_vision @require_vision
def test_text_only_inference(self): def test_text_only_inference(self):

View File

@@ -92,7 +92,8 @@ class ProcessorTesterMixin:
videos_input_name = "pixel_values_videos" videos_input_name = "pixel_values_videos"
audio_input_name = "input_features" audio_input_name = "input_features"
def prepare_processor_dict(self): @staticmethod
def prepare_processor_dict():
return {} return {}
def get_component(self, attribute, **kwargs): def get_component(self, attribute, **kwargs):
@@ -123,18 +124,23 @@ class ProcessorTesterMixin:
processor = self.processor_class(**components, **self.prepare_processor_dict()) processor = self.processor_class(**components, **self.prepare_processor_dict())
return processor return processor
# TODO: raushan unify all these special token LLMs under the general preparation. We can get audio/image token def prepare_text_inputs(self, batch_size: Optional[int] = None, modality: Optional[str] = None):
# from tokenizer, so we can generalize instead of overriding if modality is not None:
def prepare_text_inputs(self, batch_size: Optional[int] = None): special_token_to_add = getattr(self, f"{modality}_token", "")
else:
special_token_to_add = ""
if batch_size is None: if batch_size is None:
return "lower newer" return f"lower newer {special_token_to_add}"
if batch_size < 1: if batch_size < 1:
raise ValueError("batch_size must be greater than 0") raise ValueError("batch_size must be greater than 0")
if batch_size == 1: if batch_size == 1:
return ["lower newer"] return [f"lower newer {special_token_to_add}"]
return ["lower newer", "upper older longer string"] + ["lower newer"] * (batch_size - 2) return [f"lower newer {special_token_to_add}", f" {special_token_to_add} upper older longer string"] + [
f"lower newer {special_token_to_add}"
] * (batch_size - 2)
@require_vision @require_vision
def prepare_image_inputs(self, batch_size: Optional[int] = None): def prepare_image_inputs(self, batch_size: Optional[int] = None):
@@ -159,6 +165,13 @@ class ProcessorTesterMixin:
for key, value in self.prepare_processor_dict().items(): for key, value in self.prepare_processor_dict().items():
# Chat template is saved as a separate file # Chat template is saved as a separate file
if key not in "chat_template": if key not in "chat_template":
# json converts dict keys to str, but some processors force convert back to int when init
if (
isinstance(obj[key], dict)
and isinstance(list(obj[key].keys())[0], str)
and isinstance(list(value.keys())[0], int)
):
obj[key] = {int(k): v for k, v in obj[key].items()}
self.assertEqual(obj[key], value) self.assertEqual(obj[key], value)
self.assertEqual(getattr(processor, key, None), value) self.assertEqual(getattr(processor, key, None), value)
@@ -206,7 +219,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="image")
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, return_tensors="pt") inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertEqual(inputs[self.text_input_name].shape[-1], 117) self.assertEqual(inputs[self.text_input_name].shape[-1], 117)
@@ -229,7 +242,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="image")
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, return_tensors="pt") inputs = processor(text=input_str, images=image_input, return_tensors="pt")
@@ -244,7 +257,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="image")
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
inputs = processor( inputs = processor(
text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
@@ -264,7 +277,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="image")
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt") inputs = processor(text=input_str, images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt")
@@ -278,7 +291,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="image")
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
inputs = processor( inputs = processor(
text=input_str, text=input_str,
@@ -301,7 +314,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=2) input_str = self.prepare_text_inputs(batch_size=2, modality="image")
image_input = self.prepare_image_inputs(batch_size=2) image_input = self.prepare_image_inputs(batch_size=2)
inputs = processor( inputs = processor(
text=input_str, text=input_str,
@@ -327,7 +340,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = [self.prepare_text_inputs()] input_str = [self.prepare_text_inputs(modality="image")]
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
_ = processor( _ = processor(
@@ -346,7 +359,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="image")
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
# Define the kwargs for each modality # Define the kwargs for each modality
@@ -369,7 +382,7 @@ class ProcessorTesterMixin:
processor_kwargs = self.prepare_processor_dict() processor_kwargs = self.prepare_processor_dict()
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="image")
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
# Define the kwargs for each modality # Define the kwargs for each modality
@@ -396,7 +409,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs) processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=3) input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
raw_speech = floats_list((3, 1000)) raw_speech = floats_list((3, 1000))
raw_speech = [np.asarray(audio) for audio in raw_speech] raw_speech = [np.asarray(audio) for audio in raw_speech]
inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt") inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt")
@@ -414,7 +427,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs) processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=3) input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
raw_speech = floats_list((3, 1000)) raw_speech = floats_list((3, 1000))
raw_speech = [np.asarray(audio) for audio in raw_speech] raw_speech = [np.asarray(audio) for audio in raw_speech]
inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=300, padding="max_length") inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=300, padding="max_length")
@@ -433,7 +446,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs) processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=3) input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
raw_speech = floats_list((3, 1000)) raw_speech = floats_list((3, 1000))
raw_speech = [np.asarray(audio) for audio in raw_speech] raw_speech = [np.asarray(audio) for audio in raw_speech]
inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=300, padding="max_length") inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=300, padding="max_length")
@@ -452,7 +465,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs) processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=3) input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
raw_speech = floats_list((3, 1000)) raw_speech = floats_list((3, 1000))
raw_speech = [np.asarray(audio) for audio in raw_speech] raw_speech = [np.asarray(audio) for audio in raw_speech]
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
@@ -476,7 +489,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs) processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=3) input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
raw_speech = floats_list((3, 1000)) raw_speech = floats_list((3, 1000))
raw_speech = [np.asarray(audio) for audio in raw_speech] raw_speech = [np.asarray(audio) for audio in raw_speech]
@@ -499,7 +512,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="video")
video_input = self.prepare_video_inputs() video_input = self.prepare_video_inputs()
inputs = processor(text=input_str, videos=video_input, return_tensors="pt") inputs = processor(text=input_str, videos=video_input, return_tensors="pt")
self.assertEqual(inputs[self.text_input_name].shape[-1], 117) self.assertEqual(inputs[self.text_input_name].shape[-1], 117)
@@ -522,7 +535,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="video")
video_input = self.prepare_video_inputs() video_input = self.prepare_video_inputs()
inputs = processor(text=input_str, videos=video_input, return_tensors="pt") inputs = processor(text=input_str, videos=video_input, return_tensors="pt")
@@ -537,7 +550,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="video")
video_input = self.prepare_video_inputs() video_input = self.prepare_video_inputs()
inputs = processor( inputs = processor(
text=input_str, videos=video_input, return_tensors="pt", max_length=112, padding="max_length" text=input_str, videos=video_input, return_tensors="pt", max_length=112, padding="max_length"
@@ -557,7 +570,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="video")
video_input = self.prepare_video_inputs() video_input = self.prepare_video_inputs()
inputs = processor(text=input_str, videos=video_input, do_rescale=True, rescale_factor=-1, return_tensors="pt") inputs = processor(text=input_str, videos=video_input, do_rescale=True, rescale_factor=-1, return_tensors="pt")
@@ -571,7 +584,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="video")
video_input = self.prepare_video_inputs() video_input = self.prepare_video_inputs()
inputs = processor( inputs = processor(
text=input_str, text=input_str,
@@ -594,7 +607,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=2) input_str = self.prepare_text_inputs(batch_size=2, modality="video")
video_input = self.prepare_video_inputs(batch_size=2) video_input = self.prepare_video_inputs(batch_size=2)
inputs = processor( inputs = processor(
text=input_str, text=input_str,
@@ -620,7 +633,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = [self.prepare_text_inputs()] input_str = [self.prepare_text_inputs(modality="video")]
video_input = self.prepare_video_inputs() video_input = self.prepare_video_inputs()
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
_ = processor( _ = processor(
@@ -639,7 +652,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="video")
video_input = self.prepare_video_inputs() video_input = self.prepare_video_inputs()
# Define the kwargs for each modality # Define the kwargs for each modality
@@ -662,7 +675,7 @@ class ProcessorTesterMixin:
processor_kwargs = self.prepare_processor_dict() processor_kwargs = self.prepare_processor_dict()
processor = self.processor_class(**processor_components, **processor_kwargs) processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="video")
video_input = self.prepare_video_inputs() video_input = self.prepare_video_inputs()
# Define the kwargs for each modality # Define the kwargs for each modality
@@ -686,7 +699,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(**processor_components) processor = self.processor_class(**processor_components)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs() input_str = self.prepare_text_inputs(modality="image")
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
@@ -713,7 +726,7 @@ class ProcessorTesterMixin:
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs) processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor) self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=3) input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
audio_lengths = [4000, 8000, 16000, 32000] audio_lengths = [4000, 8000, 16000, 32000]
raw_speech = [np.asarray(audio)[:length] for audio, length in zip(floats_list((3, 32_000)), audio_lengths)] raw_speech = [np.asarray(audio)[:length] for audio, length in zip(floats_list((3, 32_000)), audio_lengths)]