[processor] clean up multimodal tests (#37362)
* clean up multimodal processor tests * fixup * fix tests * fix one last test * forgot
This commit is contained in:
committed by
GitHub
parent
3c39c07939
commit
a563999a02
@@ -31,12 +31,16 @@ from ...image_utils import (
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
is_scaled_image,
|
||||
make_flat_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...utils import TensorType
|
||||
from ...utils import TensorType, logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
|
||||
@@ -104,6 +108,12 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
Whether to split the image.
|
||||
do_convert_rgb (`bool`, *optional*, defaults to `True`):
|
||||
Whether to convert the image to RGB.
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
|
||||
the `preprocess` method.
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
|
||||
method.
|
||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the image.
|
||||
resample (PILImageResampling, *optional*, defaults to `BICUBIC`):
|
||||
@@ -121,6 +131,8 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
split_resolutions: Optional[List[Tuple[int, int]]] = None,
|
||||
split_image: Optional[bool] = False,
|
||||
do_convert_rgb: Optional[bool] = True,
|
||||
do_rescale: bool = True,
|
||||
rescale_factor: Union[int, float] = 1 / 255,
|
||||
do_normalize: Optional[bool] = True,
|
||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
||||
**kwargs,
|
||||
@@ -141,6 +153,8 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions]
|
||||
self.split_resolutions = split_resolutions
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
self.do_rescale = do_rescale
|
||||
self.rescale_factor = rescale_factor
|
||||
self.do_normalize = do_normalize
|
||||
self.resample = resample
|
||||
|
||||
@@ -153,6 +167,8 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
min_image_size: Optional[int] = None,
|
||||
split_image: Optional[bool] = None,
|
||||
do_convert_rgb: Optional[bool] = None,
|
||||
do_rescale: Optional[bool] = None,
|
||||
rescale_factor: Optional[float] = None,
|
||||
do_normalize: Optional[bool] = None,
|
||||
resample: PILImageResampling = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = "pt",
|
||||
@@ -177,6 +193,10 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
Whether to split the image.
|
||||
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)):
|
||||
Whether to convert the image to RGB.
|
||||
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
||||
Whether to rescale the image.
|
||||
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
||||
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
|
||||
do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)):
|
||||
Whether to normalize the image.
|
||||
resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)):
|
||||
@@ -217,6 +237,8 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
min_image_size = min_image_size if min_image_size is not None else self.min_image_size
|
||||
split_image = split_image if split_image is not None else self.split_image
|
||||
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
|
||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
||||
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
||||
resample = resample if resample is not None else self.resample
|
||||
|
||||
@@ -236,6 +258,8 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
image_mean=image_mean,
|
||||
image_std=image_std,
|
||||
resample=resample,
|
||||
do_rescale=do_rescale,
|
||||
rescale_factor=rescale_factor,
|
||||
)
|
||||
|
||||
if do_convert_rgb:
|
||||
@@ -244,6 +268,12 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
# All transformations expect numpy arrays.
|
||||
images = [to_numpy_array(image) for image in images]
|
||||
|
||||
if do_rescale and is_scaled_image(images[0]):
|
||||
logger.warning_once(
|
||||
"It looks like you are trying to rescale already rescaled images. If the input"
|
||||
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
|
||||
)
|
||||
|
||||
if input_data_format is None:
|
||||
# We assume that all images have the same channel dimension format.
|
||||
input_data_format = infer_channel_dimension_format(images[0])
|
||||
@@ -297,9 +327,14 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
pixel_mask[: new_size[0], : new_size[1]] = 1
|
||||
pixel_masks.append(pixel_mask)
|
||||
|
||||
if do_rescale:
|
||||
crop_image_padded = self.rescale(
|
||||
image=crop_image_padded, scale=rescale_factor, input_data_format=input_data_format
|
||||
)
|
||||
|
||||
if do_normalize:
|
||||
crop_image_padded = self.normalize(
|
||||
crop_image_padded / 255.0,
|
||||
crop_image_padded,
|
||||
self.image_mean,
|
||||
self.image_std,
|
||||
data_format=input_data_format,
|
||||
|
||||
@@ -28,6 +28,7 @@ from ...image_utils import (
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
is_scaled_image,
|
||||
make_flat_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
@@ -495,6 +496,12 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
Whether to split the image.
|
||||
do_convert_rgb (`bool`, *optional*, defaults to `True`):
|
||||
Whether to convert the image to RGB.
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
|
||||
the `preprocess` method.
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
|
||||
method.
|
||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the image.
|
||||
resample (PILImageResampling, *optional*, defaults to `BICUBIC`):
|
||||
@@ -512,6 +519,8 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
split_resolutions: Optional[List[Tuple[int, int]]] = None,
|
||||
split_image: Optional[bool] = False,
|
||||
do_convert_rgb: Optional[bool] = True,
|
||||
do_rescale: bool = True,
|
||||
rescale_factor: Union[int, float] = 1 / 255,
|
||||
do_normalize: Optional[bool] = True,
|
||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
||||
**kwargs,
|
||||
@@ -532,6 +541,8 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions]
|
||||
self.split_resolutions = split_resolutions
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
self.do_rescale = do_rescale
|
||||
self.rescale_factor = rescale_factor
|
||||
self.do_normalize = do_normalize
|
||||
self.resample = resample
|
||||
|
||||
@@ -544,6 +555,8 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
min_image_size: Optional[int] = None,
|
||||
split_image: Optional[bool] = None,
|
||||
do_convert_rgb: Optional[bool] = None,
|
||||
do_rescale: Optional[bool] = None,
|
||||
rescale_factor: Optional[float] = None,
|
||||
do_normalize: Optional[bool] = None,
|
||||
resample: PILImageResampling = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = "pt",
|
||||
@@ -568,6 +581,10 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
Whether to split the image.
|
||||
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)):
|
||||
Whether to convert the image to RGB.
|
||||
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
||||
Whether to rescale the image.
|
||||
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
||||
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
|
||||
do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)):
|
||||
Whether to normalize the image.
|
||||
resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)):
|
||||
@@ -608,6 +625,8 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
min_image_size = min_image_size if min_image_size is not None else self.min_image_size
|
||||
split_image = split_image if split_image is not None else self.split_image
|
||||
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
|
||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
||||
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
||||
resample = resample if resample is not None else self.resample
|
||||
|
||||
@@ -627,6 +646,8 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
image_mean=image_mean,
|
||||
image_std=image_std,
|
||||
resample=resample,
|
||||
do_rescale=do_rescale,
|
||||
rescale_factor=rescale_factor,
|
||||
)
|
||||
|
||||
if do_convert_rgb:
|
||||
@@ -635,6 +656,12 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
# All transformations expect numpy arrays.
|
||||
images = [to_numpy_array(image) for image in images]
|
||||
|
||||
if do_rescale and is_scaled_image(images[0]):
|
||||
logger.warning_once(
|
||||
"It looks like you are trying to rescale already rescaled images. If the input"
|
||||
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
|
||||
)
|
||||
|
||||
if input_data_format is None:
|
||||
# We assume that all images have the same channel dimension format.
|
||||
input_data_format = infer_channel_dimension_format(images[0])
|
||||
@@ -688,9 +715,14 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
pixel_mask[: new_size[0], : new_size[1]] = 1
|
||||
pixel_masks.append(pixel_mask)
|
||||
|
||||
if do_rescale:
|
||||
crop_image_padded = self.rescale(
|
||||
image=crop_image_padded, scale=rescale_factor, input_data_format=input_data_format
|
||||
)
|
||||
|
||||
if do_normalize:
|
||||
crop_image_padded = self.normalize(
|
||||
crop_image_padded / 255.0,
|
||||
crop_image_padded,
|
||||
self.image_mean,
|
||||
self.image_std,
|
||||
data_format=input_data_format,
|
||||
|
||||
@@ -118,8 +118,10 @@ class ColPaliProcessor(ProcessorMixin):
|
||||
tokens_to_add = {"additional_special_tokens": [image_token]}
|
||||
tokenizer.add_special_tokens(tokens_to_add)
|
||||
self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
|
||||
self.image_token = IMAGE_TOKEN
|
||||
else:
|
||||
self.image_token_id = tokenizer.image_token_id
|
||||
self.image_token = tokenizer.image_token
|
||||
|
||||
tokenizer.add_tokens(EXTRA_TOKENS)
|
||||
tokenizer.add_bos_token = False
|
||||
|
||||
@@ -65,6 +65,12 @@ class IdeficsImageProcessor(BaseImageProcessor):
|
||||
Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||
image_num_channels (`int`, *optional*, defaults to 3):
|
||||
Number of image channels.
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
|
||||
the `preprocess` method.
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
|
||||
method.
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
@@ -75,14 +81,18 @@ class IdeficsImageProcessor(BaseImageProcessor):
|
||||
image_mean: Optional[Union[float, List[float]]] = None,
|
||||
image_std: Optional[Union[float, List[float]]] = None,
|
||||
image_num_channels: Optional[int] = 3,
|
||||
do_rescale: bool = True,
|
||||
rescale_factor: Union[int, float] = 1 / 255,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.image_size = image_size
|
||||
self.image_num_channels = image_num_channels
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.image_mean = image_mean if image_mean is not None else IDEFICS_STANDARD_MEAN
|
||||
self.image_std = image_std if image_std is not None else IDEFICS_STANDARD_STD
|
||||
self.do_rescale = do_rescale
|
||||
self.rescale_factor = rescale_factor
|
||||
|
||||
def preprocess(
|
||||
self,
|
||||
@@ -92,6 +102,8 @@ class IdeficsImageProcessor(BaseImageProcessor):
|
||||
image_mean: Optional[Union[float, List[float]]] = None,
|
||||
image_std: Optional[Union[float, List[float]]] = None,
|
||||
transform: Callable = None,
|
||||
do_rescale: Optional[bool] = None,
|
||||
rescale_factor: Optional[float] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
|
||||
**kwargs,
|
||||
) -> TensorType:
|
||||
@@ -117,6 +129,12 @@ class IdeficsImageProcessor(BaseImageProcessor):
|
||||
A custom transform function that accepts a single image can be passed for training. For example,
|
||||
`torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
|
||||
assumed - and then a preset of inference-specific transforms will be applied to the images
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
|
||||
the `preprocess` method.
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
|
||||
method.
|
||||
|
||||
Returns:
|
||||
a PyTorch tensor of the processed images
|
||||
@@ -126,6 +144,8 @@ class IdeficsImageProcessor(BaseImageProcessor):
|
||||
image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels
|
||||
image_mean = image_mean if image_mean is not None else self.image_mean
|
||||
image_std = image_std if image_std is not None else self.image_std
|
||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
||||
size = (image_size, image_size)
|
||||
|
||||
if isinstance(images, list) and len(images) == 0:
|
||||
@@ -160,7 +180,7 @@ class IdeficsImageProcessor(BaseImageProcessor):
|
||||
# further transforms expect numpy arrays
|
||||
images = [to_numpy_array(x) for x in images]
|
||||
images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images]
|
||||
images = [self.rescale(image=image, scale=1 / 255) for image in images]
|
||||
images = [self.rescale(image=image, scale=rescale_factor) for image in images]
|
||||
images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
|
||||
images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images]
|
||||
images = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)["pixel_values"]
|
||||
|
||||
@@ -141,8 +141,10 @@ class PaliGemmaProcessor(ProcessorMixin):
|
||||
tokens_to_add = {"additional_special_tokens": [image_token]}
|
||||
tokenizer.add_special_tokens(tokens_to_add)
|
||||
self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
|
||||
self.image_token = IMAGE_TOKEN
|
||||
else:
|
||||
self.image_token_id = tokenizer.image_token_id
|
||||
self.image_token = tokenizer.image_token
|
||||
|
||||
tokenizer.add_tokens(EXTRA_TOKENS)
|
||||
tokenizer.add_bos_token = False
|
||||
|
||||
@@ -1086,7 +1086,6 @@ class ProcessorMixin(PushToHubMixin):
|
||||
|
||||
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
|
||||
processor_dict.update({k: v for k, v in kwargs.items() if k in processor_dict.keys()})
|
||||
return cls.from_args_and_dict(args, processor_dict, **kwargs)
|
||||
|
||||
@classmethod
|
||||
|
||||
@@ -16,7 +16,6 @@ import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from io import BytesIO
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
@@ -41,7 +40,7 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
cls.tmpdirname = tempfile.mkdtemp()
|
||||
processor = AriaProcessor.from_pretrained("m-ric/Aria_hf_2", image_seq_len=2)
|
||||
processor = AriaProcessor.from_pretrained("m-ric/Aria_hf_2", size_conversion={490: 2, 980: 2})
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image1 = Image.open(
|
||||
BytesIO(
|
||||
@@ -74,7 +73,14 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token)
|
||||
cls.global_img_tokens_id = processor.tokenizer(cls.global_img_token, add_special_tokens=False)["input_ids"]
|
||||
cls.padding_token_id = processor.tokenizer.pad_token_id
|
||||
cls.image_seq_len = 256
|
||||
cls.image_seq_len = 2
|
||||
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {
|
||||
"chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}{% elif message['content'] is iterable %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<fim_prefix><|img|><fim_suffix>{% endif %}{% endfor %}{% endif %}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
|
||||
"size_conversion": {490: 2, 980: 2},
|
||||
} # fmt: skip
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
|
||||
@@ -89,24 +95,6 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def tearDownClass(cls):
|
||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||
|
||||
def test_kwargs_overrides_default_image_processor_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
processor_components = self.prepare_components()
|
||||
processor_components["image_processor"] = self.get_component(
|
||||
"image_processor", do_rescale=True, rescale_factor=1
|
||||
)
|
||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
||||
|
||||
processor = self.processor_class(**processor_components)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
|
||||
self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
|
||||
|
||||
def test_process_interleaved_images_prompts_image_splitting(self):
|
||||
processor = self.get_processor()
|
||||
processor.image_processor.split_image = True
|
||||
@@ -236,155 +224,50 @@ And who is that?<|im_end|>
|
||||
"""
|
||||
self.assertEqual(rendered, expected_rendered)
|
||||
|
||||
# Override as AriaProcessor needs image tokens in prompts
|
||||
def prepare_text_inputs(self, batch_size: Optional[int] = None):
|
||||
if batch_size is None:
|
||||
return "lower newer <|img|>"
|
||||
def test_image_chat_template_accepts_processing_kwargs(self):
|
||||
processor = self.get_processor()
|
||||
if processor.chat_template is None:
|
||||
self.skipTest("Processor has no chat template")
|
||||
|
||||
if batch_size < 1:
|
||||
raise ValueError("batch_size must be greater than 0")
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "What is shown in this image?"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
|
||||
if batch_size == 1:
|
||||
return ["lower newer <|img|>"]
|
||||
return ["lower newer <|img|>", "<|img|> upper older longer string"] + ["<|img|> lower newer"] * (
|
||||
batch_size - 2
|
||||
)
|
||||
|
||||
# Override tests as inputs_ids padded dimension is the second one but not the last one
|
||||
@require_vision
|
||||
@require_torch
|
||||
def test_kwargs_overrides_default_tokenizer_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer", max_length=30)
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=30)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 30)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_structured_kwargs_nested(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
# Define the kwargs for each modality
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
images=image_input,
|
||||
common_kwargs={"return_tensors": "pt"},
|
||||
images_kwargs={"max_image_size": 980},
|
||||
text_kwargs={"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
|
||||
)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 980)
|
||||
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 120)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_structured_kwargs_nested_from_dict(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
# Define the kwargs for each modality
|
||||
all_kwargs = {
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
"images_kwargs": {"max_image_size": 980},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
|
||||
}
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, **all_kwargs)
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 980)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 120)
|
||||
|
||||
@require_vision
|
||||
@require_torch
|
||||
def test_tokenizer_defaults_preserved_by_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer", max_length=30)
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 30)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_unstructured_kwargs_batched(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=2)
|
||||
image_input = self.prepare_image_inputs(batch_size=2)
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
images=image_input,
|
||||
return_tensors="pt",
|
||||
padding="longest",
|
||||
max_length=76,
|
||||
truncation=True,
|
||||
max_image_size=980,
|
||||
)
|
||||
|
||||
self.assertEqual(inputs["pixel_values"].shape[1], 3)
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 980)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 76)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_unstructured_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
images=image_input,
|
||||
return_tensors="pt",
|
||||
max_image_size=980,
|
||||
formatted_prompt_tokenized = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
padding="max_length",
|
||||
max_length=120,
|
||||
truncation="longest_first",
|
||||
max_length=50,
|
||||
)
|
||||
self.assertEqual(len(formatted_prompt_tokenized[0]), 50)
|
||||
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 980)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 120)
|
||||
formatted_prompt_tokenized = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
truncation=True,
|
||||
max_length=5,
|
||||
)
|
||||
self.assertEqual(len(formatted_prompt_tokenized[0]), 5)
|
||||
|
||||
# Now test the ability to return dict
|
||||
messages[0][0]["content"].append(
|
||||
{"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
|
||||
)
|
||||
out_dict = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
max_image_size=980,
|
||||
return_tensors="np",
|
||||
)
|
||||
self.assertListEqual(list(out_dict[self.images_input_name].shape), [1, 3, 980, 980])
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from typing import Optional
|
||||
|
||||
from transformers import AutoProcessor, AutoTokenizer, AyaVisionProcessor
|
||||
from transformers.testing_utils import require_read_token, require_torch, require_vision
|
||||
@@ -61,6 +60,7 @@ class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
**processor_kwargs,
|
||||
)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image_token = processor.image_token
|
||||
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
@@ -79,20 +79,6 @@ class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def tearDownClass(cls):
|
||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||
|
||||
# Override as AyaVisionProcessor needs image tokens in prompts
|
||||
def prepare_text_inputs(self, batch_size: Optional[int] = None):
|
||||
if batch_size is None:
|
||||
return "lower newer <image>"
|
||||
|
||||
if batch_size < 1:
|
||||
raise ValueError("batch_size must be greater than 0")
|
||||
|
||||
if batch_size == 1:
|
||||
return ["lower newer <image>"]
|
||||
return ["lower newer <image>", "<image> upper older longer string"] + ["<image> lower newer"] * (
|
||||
batch_size - 2
|
||||
)
|
||||
|
||||
@require_torch
|
||||
def test_process_interleaved_images_videos(self):
|
||||
processor = self.get_processor()
|
||||
|
||||
@@ -40,5 +40,10 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB)
|
||||
tokenizer.pad_token_id = 0
|
||||
tokenizer.sep_token_id = 1
|
||||
processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer)
|
||||
processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer, image_seq_length=2)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image_token = processor.image_token
|
||||
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {"image_seq_length": 2} # fmt: skip
|
||||
|
||||
@@ -34,7 +34,7 @@ class Emu3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
cls.tmpdirname = tempfile.mkdtemp()
|
||||
image_processor = Emu3ImageProcessor()
|
||||
image_processor = Emu3ImageProcessor(min_pixels=28 * 28, max_pixels=56 * 56)
|
||||
extra_special_tokens = extra_special_tokens = {
|
||||
"image_token": "<image>",
|
||||
"boi_token": "<|image start|>",
|
||||
@@ -51,8 +51,10 @@ class Emu3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
image_processor=image_processor, tokenizer=tokenizer, chat_template="dummy_template"
|
||||
)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image_token = processor.image_token
|
||||
|
||||
def prepare_processor_dict(self):
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {
|
||||
"chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
|
||||
} # fmt: skip
|
||||
|
||||
@@ -332,7 +332,7 @@ class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
max_length=76,
|
||||
)
|
||||
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 6)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 7)
|
||||
|
||||
|
||||
@require_torch
|
||||
|
||||
@@ -56,6 +56,7 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_kwargs = cls.prepare_processor_dict()
|
||||
processor = Gemma3Processor(image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image_token = processor.boi_token
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
@@ -68,20 +69,6 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
"chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n", "image_seq_length": 3,
|
||||
} # fmt: skip
|
||||
|
||||
# Override as VLMs need image tokens in prompts
|
||||
def prepare_text_inputs(self, batch_size: Optional[int] = None):
|
||||
if batch_size is None:
|
||||
return "lower newer <start_of_image>"
|
||||
|
||||
if batch_size < 1:
|
||||
raise ValueError("batch_size must be greater than 0")
|
||||
|
||||
if batch_size == 1:
|
||||
return ["lower newer <start_of_image>"]
|
||||
return ["lower newer <start_of_image>", "<start_of_image> upper older longer string"] + [
|
||||
"<start_of_image> lower newer"
|
||||
] * (batch_size - 2)
|
||||
|
||||
# Override as Gemma3 needs images to be an explicitly nested batch
|
||||
def prepare_image_inputs(self, batch_size: Optional[int] = None):
|
||||
"""This function prepares a list of PIL images for testing"""
|
||||
@@ -123,7 +110,7 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="image")
|
||||
image_input = self.prepare_image_inputs()
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
|
||||
@@ -40,6 +40,7 @@ class GotOcr2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_kwargs = {}
|
||||
processor = GotOcr2Processor(image_processor, tokenizer, **processor_kwargs)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image_token = processor.img_pad_token
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
|
||||
|
||||
@@ -79,7 +79,7 @@ class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
cls.embed_dim = 5
|
||||
cls.seq_length = 5
|
||||
|
||||
def prepare_text_inputs(self, batch_size: Optional[int] = None):
|
||||
def prepare_text_inputs(self, batch_size: Optional[int] = None, modality: Optional[str] = None):
|
||||
labels = ["a cat", "remote control"]
|
||||
labels_longer = ["a person", "a car", "a dog", "a cat"]
|
||||
|
||||
|
||||
@@ -219,139 +219,3 @@ class IdeficsProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
|
||||
# For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask']
|
||||
self.assertSetEqual(set(inputs.keys()), set(self.input_keys))
|
||||
|
||||
# Override the following tests as Idefics image processor does not accept do_rescale and rescale_factor
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_image_processor_defaults_preserved_by_image_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor", image_size=234)
|
||||
tokenizer = self.get_component("tokenizer", max_length=117)
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input)
|
||||
self.assertEqual(len(inputs["pixel_values"][0][0][0]), 234)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_kwargs_overrides_default_image_processor_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor", image_size=234)
|
||||
tokenizer = self.get_component("tokenizer", max_length=117)
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, image_size=224)
|
||||
self.assertEqual(len(inputs["pixel_values"][0][0][0]), 224)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_unstructured_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
images=image_input,
|
||||
return_tensors="pt",
|
||||
image_size=214,
|
||||
padding="max_length",
|
||||
max_length=76,
|
||||
)
|
||||
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 214)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 76)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_unstructured_kwargs_batched(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=2)
|
||||
image_input = self.prepare_image_inputs(batch_size=2)
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
images=image_input,
|
||||
return_tensors="pt",
|
||||
image_size=214,
|
||||
padding="longest",
|
||||
max_length=76,
|
||||
)
|
||||
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 214)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 8)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_structured_kwargs_nested(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
# Define the kwargs for each modality
|
||||
all_kwargs = {
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
"images_kwargs": {"image_size": 214},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
||||
}
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, **all_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 214)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 76)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_structured_kwargs_nested_from_dict(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
# Define the kwargs for each modality
|
||||
all_kwargs = {
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
"images_kwargs": {"image_size": 214},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
||||
}
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, **all_kwargs)
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 214)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 76)
|
||||
|
||||
@@ -16,7 +16,6 @@ import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from io import BytesIO
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
|
||||
@@ -84,6 +83,10 @@ class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def get_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {"image_seq_len": 2}
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||
@@ -329,17 +332,3 @@ class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
"Assistant:"
|
||||
)
|
||||
self.assertEqual(rendered, expected_rendered)
|
||||
|
||||
# Override as Idefics2Processor needs image tokens in prompts
|
||||
def prepare_text_inputs(self, batch_size: Optional[int] = None):
|
||||
if batch_size is None:
|
||||
return "lower newer <image>"
|
||||
|
||||
if batch_size < 1:
|
||||
raise ValueError("batch_size must be greater than 0")
|
||||
|
||||
if batch_size == 1:
|
||||
return ["lower newer <image>"]
|
||||
return ["lower newer <image>", "<image> upper older longer string"] + ["<image> lower newer"] * (
|
||||
batch_size - 2
|
||||
)
|
||||
|
||||
@@ -16,7 +16,6 @@ import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from io import BytesIO
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
@@ -81,6 +80,10 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def get_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {"image_seq_len": 2}
|
||||
|
||||
def get_split_image_expected_tokens(self, processor, image_rows, image_cols):
|
||||
text_split_images = []
|
||||
for n_h in range(image_rows):
|
||||
@@ -352,159 +355,6 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
)
|
||||
self.assertEqual(rendered, expected_rendered)
|
||||
|
||||
# Override as Idefics3Processor needs image tokens in prompts
|
||||
def prepare_text_inputs(self, batch_size: Optional[int] = None):
|
||||
if batch_size is None:
|
||||
return "lower newer <image>"
|
||||
|
||||
if batch_size < 1:
|
||||
raise ValueError("batch_size must be greater than 0")
|
||||
|
||||
if batch_size == 1:
|
||||
return ["lower newer <image>"]
|
||||
return ["lower newer <image>", "<image> upper older longer string"] + ["<image> lower newer"] * (
|
||||
batch_size - 2
|
||||
)
|
||||
|
||||
# Override tests as inputs_ids padded dimension is the second one but not the last one
|
||||
@require_vision
|
||||
@require_torch
|
||||
def test_kwargs_overrides_default_tokenizer_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer", max_length=30)
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=30)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 30)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_structured_kwargs_nested(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
# Define the kwargs for each modality
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
images=image_input,
|
||||
common_kwargs={"return_tensors": "pt"},
|
||||
images_kwargs={"max_image_size": {"longest_edge": 32}},
|
||||
text_kwargs={"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
|
||||
)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 32)
|
||||
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 120)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_structured_kwargs_nested_from_dict(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
# Define the kwargs for each modality
|
||||
all_kwargs = {
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
"images_kwargs": {"max_image_size": {"longest_edge": 32}},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
|
||||
}
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, **all_kwargs)
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 32)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 120)
|
||||
|
||||
@require_vision
|
||||
@require_torch
|
||||
def test_tokenizer_defaults_preserved_by_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer", max_length=30)
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 30)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_unstructured_kwargs_batched(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=2)
|
||||
image_input = self.prepare_image_inputs(batch_size=2)
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
images=image_input,
|
||||
return_tensors="pt",
|
||||
padding="longest",
|
||||
max_length=76,
|
||||
truncation=True,
|
||||
max_image_size={"longest_edge": 30},
|
||||
)
|
||||
|
||||
self.assertEqual(inputs["pixel_values"].shape[2], 3)
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 30)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 76)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_unstructured_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
images=image_input,
|
||||
return_tensors="pt",
|
||||
max_image_size={"longest_edge": 32},
|
||||
padding="max_length",
|
||||
max_length=120,
|
||||
truncation="longest_first",
|
||||
)
|
||||
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 32)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 120)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_text_only_inference(self):
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from typing import Optional
|
||||
|
||||
from transformers import AutoProcessor, Llama4Processor, PreTrainedTokenizerFast
|
||||
from transformers.testing_utils import require_vision
|
||||
@@ -38,9 +37,10 @@ class Llama4ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
|
||||
image_processor = Llama4ImageProcessorFast(max_patches=1, size={"height": 20, "width": 20})
|
||||
tokenizer = PreTrainedTokenizerFast.from_pretrained("unsloth/Llama-3.2-11B-Vision-Instruct-unsloth-bnb-4bit")
|
||||
processor_kwargs = {}
|
||||
processor_kwargs = cls.prepare_processor_dict()
|
||||
processor = Llama4Processor(image_processor, tokenizer, **processor_kwargs)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image_token = processor.image_token
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
|
||||
@@ -51,21 +51,3 @@ class Llama4ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
shutil.rmtree(cls.tmpdirname)
|
||||
|
||||
# Override as Llama4Processor needs image tokens in prompts
|
||||
def prepare_text_inputs(self, batch_size: Optional[int] = None):
|
||||
if batch_size is None:
|
||||
return "lower newer <|image|>"
|
||||
|
||||
if batch_size < 1:
|
||||
raise ValueError("batch_size must be greater than 0")
|
||||
|
||||
if batch_size == 1:
|
||||
return ["lower newer <|image|>"]
|
||||
return ["lower newer <|image|>", "<|image|> upper older longer string"] + ["<|image|> lower newer"] * (
|
||||
batch_size - 2
|
||||
)
|
||||
|
||||
@unittest.skip("This test uses return_tensors='np' which is not supported")
|
||||
def test_image_chat_template_accepts_processing_kwargs(self):
|
||||
pass
|
||||
|
||||
@@ -43,6 +43,7 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_kwargs = cls.prepare_processor_dict()
|
||||
processor = LlavaProcessor(image_processor, tokenizer, **processor_kwargs)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image_token = processor.image_token
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
|
||||
@@ -58,18 +59,10 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def prepare_processor_dict():
|
||||
return {
|
||||
"chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
|
||||
"patch_size": 3,
|
||||
"patch_size": 128,
|
||||
"vision_feature_select_strategy": "default"
|
||||
} # fmt: skip
|
||||
|
||||
@unittest.skip(
|
||||
"Skip because the model has no processor kwargs except for chat template and"
|
||||
"chat template is saved as a separate file. Stop skipping this test when the processor"
|
||||
"has new kwargs saved in config file."
|
||||
)
|
||||
def test_processor_to_json_string(self):
|
||||
pass
|
||||
|
||||
def test_chat_template_is_saved(self):
|
||||
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
|
||||
processor_dict_loaded = json.loads(processor_loaded.to_json_string())
|
||||
|
||||
@@ -43,6 +43,7 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_kwargs = cls.prepare_processor_dict()
|
||||
processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image_token = processor.image_token
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
|
||||
@@ -54,18 +55,10 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def prepare_processor_dict():
|
||||
return {
|
||||
"chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
|
||||
"patch_size": 3,
|
||||
"patch_size": 128,
|
||||
"vision_feature_select_strategy": "default"
|
||||
} # fmt: skip
|
||||
|
||||
@unittest.skip(
|
||||
"Skip because the model has no processor kwargs except for chat template and"
|
||||
"chat template is saved as a separate file. Stop skipping this test when the processor"
|
||||
"has new kwargs saved in config file."
|
||||
)
|
||||
def test_processor_to_json_string(self):
|
||||
pass
|
||||
|
||||
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
|
||||
def test_chat_template_is_saved(self):
|
||||
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
|
||||
|
||||
@@ -47,6 +47,8 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
video_processor=video_processor, image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs
|
||||
)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image_token = processor.image_token
|
||||
cls.video_token = processor.video_token
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
|
||||
@@ -61,20 +63,11 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def prepare_processor_dict(cls):
|
||||
return {
|
||||
"chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + ' '}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>' }}{% endfor %}{# Render all video then #}{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '<video>' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ '\n' + content['text'] }}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ '\n' + content['text'] }}{% endgeneration %}{% endfor %}{% endif %}{{'<|im_end|>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
|
||||
"num_additional_image_tokens": 6,
|
||||
"patch_size": 4,
|
||||
"num_additional_image_tokens": 0,
|
||||
"patch_size": 128,
|
||||
"vision_feature_select_strategy": "default",
|
||||
}
|
||||
|
||||
def test_processor_to_json_string(self):
|
||||
processor = self.get_processor()
|
||||
obj = json.loads(processor.to_json_string())
|
||||
for key, value in self.prepare_processor_dict().items():
|
||||
# chat_tempalate are tested as a separate test because they are saved in separate files
|
||||
if key != "chat_template":
|
||||
self.assertEqual(obj[key], value)
|
||||
self.assertEqual(getattr(processor, key, None), value)
|
||||
|
||||
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
|
||||
def test_chat_template_is_saved(self):
|
||||
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
|
||||
|
||||
@@ -51,6 +51,8 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
video_processor=video_processor, image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs
|
||||
)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image_token = processor.image_token
|
||||
cls.video_token = processor.video_token
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
|
||||
@@ -73,15 +75,6 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
"vision_feature_select_strategy": "default"
|
||||
} # fmt: skip
|
||||
|
||||
def test_processor_to_json_string(self):
|
||||
processor = self.get_processor()
|
||||
obj = json.loads(processor.to_json_string())
|
||||
for key, value in self.prepare_processor_dict().items():
|
||||
# chat_tempalate are tested as a separate test because they are saved in separate files
|
||||
if key != "chat_template":
|
||||
self.assertEqual(obj[key], value)
|
||||
self.assertEqual(getattr(processor, key, None), value)
|
||||
|
||||
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
|
||||
def test_chat_template_is_saved(self):
|
||||
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
|
||||
|
||||
@@ -19,7 +19,7 @@ import unittest
|
||||
import requests
|
||||
|
||||
from transformers import PixtralProcessor
|
||||
from transformers.testing_utils import require_read_token, require_vision
|
||||
from transformers.testing_utils import require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
@@ -34,7 +34,6 @@ if is_vision_available():
|
||||
|
||||
|
||||
@require_vision
|
||||
@require_read_token
|
||||
class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
"""This tests Pixtral processor with the new `spatial_merge_size` argument in Mistral3."""
|
||||
|
||||
@@ -49,30 +48,37 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
cls.url_2 = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
|
||||
cls.image_2 = Image.open(requests.get(cls.url_2, stream=True).raw)
|
||||
|
||||
def setUp(self):
|
||||
self.tmpdirname = tempfile.mkdtemp()
|
||||
cls.tmpdirname = tempfile.mkdtemp()
|
||||
cls.addClassCleanup(lambda tempdir=cls.tmpdirname: shutil.rmtree(tempdir))
|
||||
|
||||
processor_kwargs = cls.prepare_processor_dict()
|
||||
processor = PixtralProcessor.from_pretrained(
|
||||
"hf-internal-testing/Mistral-Small-3.1-24B-Instruct-2503-only-processor"
|
||||
"hf-internal-testing/Mistral-Small-3.1-24B-Instruct-2503-only-processor", **processor_kwargs
|
||||
)
|
||||
processor.save_pretrained(self.tmpdirname)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image_token = processor.image_token
|
||||
|
||||
def get_processor(self):
|
||||
return self.processor_class.from_pretrained(self.tmpdirname)
|
||||
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.tmpdirname)
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {
|
||||
"chat_template": "{%- set today = strftime_now(\"%Y-%m-%d\") %}\n{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\\nYour knowledge base was last updated on 2023-10-01. The current date is \" + today + \".\\n\\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \\\"What are some good restaurants around me?\\\" => \\\"Where are you?\\\" or \\\"When is the next flight to Tokyo\\\" => \\\"Where do you travel from?\\\")\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n {%- if messages[0] is string %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n {%- else %} \n {%- set system_message = messages[0]['content'][0]['text'] %}\n {%- set loop_messages = messages[1:] %}\n {%- endif %}\n{%- else %}\n {%- set system_message = default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n {%- if message['content'] is string %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n {%- else %}\n {{- '[INST]' }}\n {%- for block in message['content'] %}\n {%- if block['type'] == 'text' %}\n {{- block['text'] }}\n {%- elif block['type'] == 'image' or block['type'] == 'image_url' %}\n {{- '[IMG]' }}\n {%- else %}\n {{- raise_exception('Only text and image blocks are supported in message content!') }}\n {%- endif %}\n {%- endfor %}\n {{- '[/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'system' %}\n {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' 
}}\n {%- elif message['role'] == 'assistant' %}\n {%- if message['content'] is string %}\n {{- message['content'] + eos_token }}\n {%- else %}\n {{- message['content'][0]['text'] + eos_token }}\n {%- endif %}\n {%- else %}\n {{- raise_exception('Only user, system and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}",
|
||||
"patch_size": 128,
|
||||
} # fmt: skip
|
||||
|
||||
def test_image_token_filling(self):
|
||||
processor = self.processor_class.from_pretrained(self.tmpdirname)
|
||||
# Important to check with non square image
|
||||
image = torch.randint(0, 2, (3, 500, 316))
|
||||
expected_image_tokens = 198
|
||||
expected_image_tokens = 4
|
||||
image_token_index = 10
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "",
|
||||
"content": [{"type": "text", "text": "You are a helpful assistant."}],
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
@@ -104,14 +110,14 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(len(inputs_image["input_ids"]) == 1)
|
||||
self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
|
||||
self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 30]))
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 36]))
|
||||
|
||||
# fmt: off
|
||||
input_ids = inputs_image["input_ids"]
|
||||
self.assertEqual(
|
||||
input_ids[0].tolist(),
|
||||
# Equivalent to "USER: [IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:"
|
||||
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
[1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
@@ -121,36 +127,36 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(len(inputs_url["input_ids"]) == 1)
|
||||
self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
|
||||
self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 30]))
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 36]))
|
||||
|
||||
# fmt: off
|
||||
input_ids = inputs_url["input_ids"]
|
||||
self.assertEqual(
|
||||
input_ids[0].tolist(),
|
||||
# Equivalent to "USER: [IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:"
|
||||
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
[1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
# Test passing inputs as a single list
|
||||
inputs_image = processor(text=prompt_string, images=[self.image_0], return_tensors="pt")
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 30]))
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 36]))
|
||||
|
||||
# fmt: off
|
||||
self.assertEqual(
|
||||
inputs_image["input_ids"][0].tolist(),
|
||||
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
[1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
# Test as nested single list
|
||||
inputs_image = processor(text=prompt_string, images=[[self.image_0]], return_tensors="pt")
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 30]))
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 36]))
|
||||
|
||||
# fmt: off
|
||||
self.assertEqual(
|
||||
inputs_image["input_ids"][0].tolist(),
|
||||
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
[1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
@@ -168,14 +174,14 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(len(inputs_image["input_ids"]) == 1)
|
||||
self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
|
||||
self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 30]))
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 36]))
|
||||
|
||||
# fmt: off
|
||||
input_ids = inputs_image["input_ids"]
|
||||
self.assertEqual(
|
||||
input_ids[0].tolist(),
|
||||
# Equivalent to ["USER: [IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
|
||||
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
[1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
@@ -185,25 +191,25 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(len(inputs_url["input_ids"]) == 1)
|
||||
self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
|
||||
self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 30]))
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 36]))
|
||||
|
||||
# fmt: off
|
||||
input_ids = inputs_url["input_ids"]
|
||||
self.assertEqual(
|
||||
input_ids[0].tolist(),
|
||||
# Equivalent to ["USER: [IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
|
||||
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
[1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
# Test passing in as a nested list
|
||||
inputs_url = processor(text=prompt_string, images=[[self.image_0, self.image_1]], return_tensors="pt")
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 30]))
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 36]))
|
||||
|
||||
# fmt: off
|
||||
self.assertEqual(
|
||||
inputs_url["input_ids"][0].tolist(),
|
||||
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
[1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
@@ -226,14 +232,14 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(len(inputs_image["input_ids"]) == 2)
|
||||
self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
|
||||
self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 30, 30]))
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 36, 36]))
|
||||
|
||||
# fmt: off
|
||||
input_ids = inputs_image["input_ids"]
|
||||
self.assertEqual(
|
||||
input_ids[0].tolist(),
|
||||
# Equivalent to ["USER: [IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
|
||||
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
[1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
@@ -243,14 +249,14 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(len(inputs_url["input_ids"]) == 2)
|
||||
self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
|
||||
self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 30, 30]))
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 36, 36]))
|
||||
|
||||
# fmt: off
|
||||
input_ids = inputs_url["input_ids"]
|
||||
self.assertEqual(
|
||||
input_ids[0].tolist(),
|
||||
# Equivalent to ["USER: [IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
|
||||
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
[1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
@@ -258,12 +264,12 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
inputs_image = processor(
|
||||
text=prompt_string, images=[self.image_0, self.image_1, self.image_2], return_tensors="pt", padding=True
|
||||
)
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 30, 30]))
|
||||
self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 36, 36]))
|
||||
|
||||
# fmt: off
|
||||
self.assertEqual(
|
||||
inputs_image["input_ids"][0].tolist(),
|
||||
[1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
[1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
|
||||
@@ -16,7 +16,6 @@ import json
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -333,20 +332,6 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
with self.assertRaises(ValueError):
|
||||
processor(text=text, images=None, padding=True)
|
||||
|
||||
# Override as MllamaProcessor needs image tokens in prompts
|
||||
def prepare_text_inputs(self, batch_size: Optional[int] = None):
|
||||
if batch_size is None:
|
||||
return "lower newer <|image|>"
|
||||
|
||||
if batch_size < 1:
|
||||
raise ValueError("batch_size must be greater than 0")
|
||||
|
||||
if batch_size == 1:
|
||||
return ["lower newer <|image|>"]
|
||||
return ["lower newer <|image|>", "<|image|> upper older longer string"] + ["<|image|> lower newer"] * (
|
||||
batch_size - 2
|
||||
)
|
||||
|
||||
def test_unstructured_kwargs_batched(self):
|
||||
# Overridden because Mllama expects images in nested format. For 2 images it can't infer
|
||||
# the correct nesting, so we better throw an error
|
||||
@@ -357,7 +342,7 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=2)
|
||||
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
|
||||
image_input = self.prepare_image_inputs(batch_size=2)
|
||||
image_input = [[image_input[0]], [image_input[1]]]
|
||||
inputs = processor(
|
||||
|
||||
@@ -37,10 +37,11 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def setUpClass(cls):
|
||||
cls.tmpdirname = tempfile.mkdtemp()
|
||||
image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
|
||||
image_processor.image_seq_length = 0
|
||||
image_processor.image_seq_length = 0 # TODO: raushan fix me in #37342
|
||||
tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||
processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image_token = processor.image_token
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
|
||||
@@ -43,8 +43,11 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
cls.tmpdirname = tempfile.mkdtemp()
|
||||
processor = Qwen2_5_VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", patch_size=4)
|
||||
processor = Qwen2_5_VLProcessor.from_pretrained(
|
||||
"Qwen/Qwen2-VL-7B-Instruct", patch_size=4, max_pixels=56 * 56, min_pixels=28 * 28
|
||||
)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image_token = processor.image_token
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
|
||||
@@ -52,8 +55,11 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def get_image_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
|
||||
|
||||
def prepare_processor_dict(self):
|
||||
return {"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"} # fmt: skip
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {
|
||||
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
|
||||
} # fmt: skip
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
@@ -206,7 +212,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
self.assertEqual(len(out_dict[input_name]), batch_size * 19200)
|
||||
self.assertEqual(len(out_dict[input_name]), batch_size * 192)
|
||||
|
||||
return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
|
||||
for k in out_dict:
|
||||
@@ -261,7 +267,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
num_frames=num_frames,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 115200)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)
|
||||
|
||||
# Load with `video_fps` arg
|
||||
video_fps = 1
|
||||
@@ -273,7 +279,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
video_fps=video_fps,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 288000)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 900)
|
||||
|
||||
# Load with `video_fps` and `num_frames` args, should raise an error
|
||||
with self.assertRaises(ValueError):
|
||||
@@ -294,7 +300,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
return_dict=True,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8640000)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 27000)
|
||||
|
||||
# Load video as a list of frames (i.e. images). NOTE: each frame should have same size
|
||||
# because we assume they come from one video
|
||||
@@ -312,7 +318,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
return_dict=True,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 160)
|
||||
|
||||
def test_kwargs_overrides_custom_image_processor_kwargs(self):
|
||||
processor_components = self.prepare_components()
|
||||
@@ -328,7 +334,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
|
||||
self.assertEqual(inputs[self.images_input_name].shape[0], 612)
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
|
||||
self.assertEqual(inputs[self.images_input_name].shape[0], 800)
|
||||
self.assertEqual(inputs[self.images_input_name].shape[0], 100)
|
||||
|
||||
@require_av
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
@@ -395,4 +401,4 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
|
||||
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
|
||||
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1756800)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from typing import Optional
|
||||
|
||||
from transformers import AutoProcessor, AutoTokenizer, Qwen2AudioProcessor, WhisperFeatureExtractor
|
||||
from transformers.testing_utils import require_torch, require_torchaudio
|
||||
@@ -40,6 +39,7 @@ class Qwen2AudioProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_kwargs = cls.prepare_processor_dict()
|
||||
processor = Qwen2AudioProcessor.from_pretrained(cls.checkpoint, **processor_kwargs)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.audio_token = processor.audio_token
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
|
||||
@@ -57,20 +57,6 @@ class Qwen2AudioProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
"chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or content['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
|
||||
}
|
||||
|
||||
# Override as Qwen2AudioProcessor needs audio tokens in prompts
|
||||
def prepare_text_inputs(self, batch_size: Optional[int] = None):
|
||||
if batch_size is None:
|
||||
return "lower newer <|AUDIO|>"
|
||||
|
||||
if batch_size < 1:
|
||||
raise ValueError("batch_size must be greater than 0")
|
||||
|
||||
if batch_size == 1:
|
||||
return ["lower newer <|AUDIO|>"]
|
||||
return ["lower newer <|AUDIO|>", "<|AUDIO|> upper older longer string"] + ["<|AUDIO|> lower newer"] * (
|
||||
batch_size - 2
|
||||
)
|
||||
|
||||
def test_can_load_various_tokenizers(self):
|
||||
processor = Qwen2AudioProcessor.from_pretrained(self.checkpoint)
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
|
||||
|
||||
@@ -43,8 +43,11 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
cls.tmpdirname = tempfile.mkdtemp()
|
||||
processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", patch_size=4)
|
||||
processor = Qwen2VLProcessor.from_pretrained(
|
||||
"Qwen/Qwen2-VL-7B-Instruct", patch_size=4, max_pixels=56 * 56, min_pixels=28 * 28
|
||||
)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image_token = processor.image_token
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
|
||||
@@ -52,7 +55,8 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def get_image_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
|
||||
|
||||
def prepare_processor_dict(self):
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"} # fmt: skip
|
||||
|
||||
@classmethod
|
||||
@@ -203,7 +207,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
self.assertEqual(len(out_dict[input_name]), batch_size * 19200)
|
||||
self.assertEqual(len(out_dict[input_name]), batch_size * 192)
|
||||
|
||||
return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
|
||||
for k in out_dict:
|
||||
@@ -258,7 +262,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
num_frames=num_frames,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 115200)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)
|
||||
|
||||
# Load with `video_fps` arg
|
||||
video_fps = 1
|
||||
@@ -270,7 +274,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
video_fps=video_fps,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 288000)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 900)
|
||||
|
||||
# Load with `video_fps` and `num_frames` args, should raise an error
|
||||
with self.assertRaises(ValueError):
|
||||
@@ -291,7 +295,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
return_dict=True,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8640000)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 27000)
|
||||
|
||||
# Load video as a list of frames (i.e. images). NOTE: each frame should have same size
|
||||
# because we assume they come from one video
|
||||
@@ -309,7 +313,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
return_dict=True,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 160)
|
||||
|
||||
@require_av
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
@@ -376,7 +380,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
|
||||
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
|
||||
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1756800)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)
|
||||
|
||||
def test_kwargs_overrides_custom_image_processor_kwargs(self):
|
||||
processor_components = self.prepare_components()
|
||||
@@ -390,6 +394,6 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
|
||||
self.assertEqual(inputs[self.images_input_name].shape[0], 800)
|
||||
self.assertEqual(inputs[self.images_input_name].shape[0], 100)
|
||||
inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
|
||||
self.assertEqual(inputs[self.images_input_name].shape[0], 612)
|
||||
|
||||
@@ -16,7 +16,6 @@ import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from io import BytesIO
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
@@ -42,7 +41,8 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
cls.tmpdirname = tempfile.mkdtemp()
|
||||
processor = SmolVLMProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct", image_seq_len=2)
|
||||
processor_kwargs = cls.prepare_processor_dict()
|
||||
processor = SmolVLMProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct", **processor_kwargs)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
cls.image1 = Image.open(
|
||||
BytesIO(
|
||||
@@ -82,9 +82,10 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def get_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def prepare_processor_dict(self):
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {
|
||||
"image_seq_len": self.image_seq_len,
|
||||
"image_seq_len": 2,
|
||||
"chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
|
||||
}
|
||||
|
||||
@@ -426,106 +427,6 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
# NOTE: the last assert checks are removed
|
||||
# Loading video as a list of frames (i.e. images) is not supported in SmolVLM
|
||||
|
||||
# Override as SmolVLMProcessor needs image tokens in prompts
|
||||
def prepare_text_inputs(self, batch_size: Optional[int] = None):
|
||||
if batch_size is None:
|
||||
return "lower newer <image>"
|
||||
|
||||
if batch_size < 1:
|
||||
raise ValueError("batch_size must be greater than 0")
|
||||
|
||||
if batch_size == 1:
|
||||
return ["lower newer <image>"]
|
||||
return ["lower newer <image>", "<image> upper older longer string"] + ["<image> lower newer"] * (
|
||||
batch_size - 2
|
||||
)
|
||||
|
||||
# Override tests as inputs_ids padded dimension is the second one but not the last one
|
||||
@require_vision
|
||||
@require_torch
|
||||
def test_kwargs_overrides_default_tokenizer_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer", max_length=30)
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=30)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 30)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_structured_kwargs_nested(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
# Define the kwargs for each modality
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
images=image_input,
|
||||
common_kwargs={"return_tensors": "pt"},
|
||||
images_kwargs={"max_image_size": {"longest_edge": 32}},
|
||||
text_kwargs={"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
|
||||
)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 32)
|
||||
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 120)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_structured_kwargs_nested_from_dict(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
# Define the kwargs for each modality
|
||||
all_kwargs = {
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
"images_kwargs": {"max_image_size": {"longest_edge": 32}},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
|
||||
}
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, **all_kwargs)
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 32)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 120)
|
||||
|
||||
@require_vision
|
||||
@require_torch
|
||||
def test_tokenizer_defaults_preserved_by_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer", max_length=30)
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 30)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_unstructured_kwargs_batched(self):
|
||||
@@ -537,7 +438,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=2)
|
||||
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
|
||||
image_input = self.prepare_image_inputs(batch_size=2)
|
||||
image_input = [[image_input[0]], [image_input[1]]]
|
||||
inputs = processor(
|
||||
@@ -554,32 +455,6 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 30)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 76)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_unstructured_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
image_input = self.prepare_image_inputs()
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
images=image_input,
|
||||
return_tensors="pt",
|
||||
max_image_size={"longest_edge": 32},
|
||||
padding="max_length",
|
||||
max_length=120,
|
||||
truncation="longest_first",
|
||||
)
|
||||
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 32)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 120)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_text_only_inference(self):
|
||||
|
||||
@@ -92,7 +92,8 @@ class ProcessorTesterMixin:
|
||||
videos_input_name = "pixel_values_videos"
|
||||
audio_input_name = "input_features"
|
||||
|
||||
def prepare_processor_dict(self):
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {}
|
||||
|
||||
def get_component(self, attribute, **kwargs):
|
||||
@@ -123,18 +124,23 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(**components, **self.prepare_processor_dict())
|
||||
return processor
|
||||
|
||||
# TODO: raushan unify all these special token LLMs under the general preparation. We can get audio/image token
|
||||
# from tokenizer, so we can generalize instead of overriding
|
||||
def prepare_text_inputs(self, batch_size: Optional[int] = None):
|
||||
def prepare_text_inputs(self, batch_size: Optional[int] = None, modality: Optional[str] = None):
|
||||
if modality is not None:
|
||||
special_token_to_add = getattr(self, f"{modality}_token", "")
|
||||
else:
|
||||
special_token_to_add = ""
|
||||
|
||||
if batch_size is None:
|
||||
return "lower newer"
|
||||
return f"lower newer {special_token_to_add}"
|
||||
|
||||
if batch_size < 1:
|
||||
raise ValueError("batch_size must be greater than 0")
|
||||
|
||||
if batch_size == 1:
|
||||
return ["lower newer"]
|
||||
return ["lower newer", "upper older longer string"] + ["lower newer"] * (batch_size - 2)
|
||||
return [f"lower newer {special_token_to_add}"]
|
||||
return [f"lower newer {special_token_to_add}", f" {special_token_to_add} upper older longer string"] + [
|
||||
f"lower newer {special_token_to_add}"
|
||||
] * (batch_size - 2)
|
||||
|
||||
@require_vision
|
||||
def prepare_image_inputs(self, batch_size: Optional[int] = None):
|
||||
@@ -159,6 +165,13 @@ class ProcessorTesterMixin:
|
||||
for key, value in self.prepare_processor_dict().items():
|
||||
# Chat template is saved as a separate file
|
||||
if key not in "chat_template":
|
||||
# json converts dict keys to str, but some processors force convert back to int when init
|
||||
if (
|
||||
isinstance(obj[key], dict)
|
||||
and isinstance(list(obj[key].keys())[0], str)
|
||||
and isinstance(list(value.keys())[0], int)
|
||||
):
|
||||
obj[key] = {int(k): v for k, v in obj[key].items()}
|
||||
self.assertEqual(obj[key], value)
|
||||
self.assertEqual(getattr(processor, key, None), value)
|
||||
|
||||
@@ -206,7 +219,7 @@ class ProcessorTesterMixin:
|
||||
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="image")
|
||||
image_input = self.prepare_image_inputs()
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 117)
|
||||
@@ -229,7 +242,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="image")
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
|
||||
@@ -244,7 +257,7 @@ class ProcessorTesterMixin:
|
||||
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="image")
|
||||
image_input = self.prepare_image_inputs()
|
||||
inputs = processor(
|
||||
text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
|
||||
@@ -264,7 +277,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="image")
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt")
|
||||
@@ -278,7 +291,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="image")
|
||||
image_input = self.prepare_image_inputs()
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
@@ -301,7 +314,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=2)
|
||||
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
|
||||
image_input = self.prepare_image_inputs(batch_size=2)
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
@@ -327,7 +340,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = [self.prepare_text_inputs()]
|
||||
input_str = [self.prepare_text_inputs(modality="image")]
|
||||
image_input = self.prepare_image_inputs()
|
||||
with self.assertRaises(ValueError):
|
||||
_ = processor(
|
||||
@@ -346,7 +359,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="image")
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
# Define the kwargs for each modality
|
||||
@@ -369,7 +382,7 @@ class ProcessorTesterMixin:
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="image")
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
# Define the kwargs for each modality
|
||||
@@ -396,7 +409,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=3)
|
||||
input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
|
||||
raw_speech = floats_list((3, 1000))
|
||||
raw_speech = [np.asarray(audio) for audio in raw_speech]
|
||||
inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt")
|
||||
@@ -414,7 +427,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=3)
|
||||
input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
|
||||
raw_speech = floats_list((3, 1000))
|
||||
raw_speech = [np.asarray(audio) for audio in raw_speech]
|
||||
inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=300, padding="max_length")
|
||||
@@ -433,7 +446,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=3)
|
||||
input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
|
||||
raw_speech = floats_list((3, 1000))
|
||||
raw_speech = [np.asarray(audio) for audio in raw_speech]
|
||||
inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=300, padding="max_length")
|
||||
@@ -452,7 +465,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=3)
|
||||
input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
|
||||
raw_speech = floats_list((3, 1000))
|
||||
raw_speech = [np.asarray(audio) for audio in raw_speech]
|
||||
with self.assertRaises(ValueError):
|
||||
@@ -476,7 +489,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=3)
|
||||
input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
|
||||
raw_speech = floats_list((3, 1000))
|
||||
raw_speech = [np.asarray(audio) for audio in raw_speech]
|
||||
|
||||
@@ -499,7 +512,7 @@ class ProcessorTesterMixin:
|
||||
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="video")
|
||||
video_input = self.prepare_video_inputs()
|
||||
inputs = processor(text=input_str, videos=video_input, return_tensors="pt")
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 117)
|
||||
@@ -522,7 +535,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="video")
|
||||
video_input = self.prepare_video_inputs()
|
||||
|
||||
inputs = processor(text=input_str, videos=video_input, return_tensors="pt")
|
||||
@@ -537,7 +550,7 @@ class ProcessorTesterMixin:
|
||||
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="video")
|
||||
video_input = self.prepare_video_inputs()
|
||||
inputs = processor(
|
||||
text=input_str, videos=video_input, return_tensors="pt", max_length=112, padding="max_length"
|
||||
@@ -557,7 +570,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="video")
|
||||
video_input = self.prepare_video_inputs()
|
||||
|
||||
inputs = processor(text=input_str, videos=video_input, do_rescale=True, rescale_factor=-1, return_tensors="pt")
|
||||
@@ -571,7 +584,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="video")
|
||||
video_input = self.prepare_video_inputs()
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
@@ -594,7 +607,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=2)
|
||||
input_str = self.prepare_text_inputs(batch_size=2, modality="video")
|
||||
video_input = self.prepare_video_inputs(batch_size=2)
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
@@ -620,7 +633,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = [self.prepare_text_inputs()]
|
||||
input_str = [self.prepare_text_inputs(modality="video")]
|
||||
video_input = self.prepare_video_inputs()
|
||||
with self.assertRaises(ValueError):
|
||||
_ = processor(
|
||||
@@ -639,7 +652,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="video")
|
||||
video_input = self.prepare_video_inputs()
|
||||
|
||||
# Define the kwargs for each modality
|
||||
@@ -662,7 +675,7 @@ class ProcessorTesterMixin:
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="video")
|
||||
video_input = self.prepare_video_inputs()
|
||||
|
||||
# Define the kwargs for each modality
|
||||
@@ -686,7 +699,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(**processor_components)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs()
|
||||
input_str = self.prepare_text_inputs(modality="image")
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
@@ -713,7 +726,7 @@ class ProcessorTesterMixin:
|
||||
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=3)
|
||||
input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
|
||||
audio_lengths = [4000, 8000, 16000, 32000]
|
||||
raw_speech = [np.asarray(audio)[:length] for audio, length in zip(floats_list((3, 32_000)), audio_lengths)]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user