From d7188ba600e36d3fd191b12e19f1b3bb81a8404f Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Thu, 30 Jan 2025 16:49:20 -0500 Subject: [PATCH] Add support for nested images to LLava and VipLLava (#35558) * move make_flat_list_of_images and make_batched_videos to image_utils * remove unnecessary is_vision_available * move make_nested_list_of_images to image_utils * fix fast pixtral image processor * fix import mllama * fix make_nested_list_of_images * add tests * convert 4d arrays/tensors to list * add test_make_batched_videos * add support nested batch of videos * fix image processing qwen2vl --- src/transformers/image_utils.py | 110 +++++- .../models/aria/image_processing_aria.py | 27 +- src/transformers/models/aria/modular_aria.py | 5 +- .../models/blip/image_processing_blip.py | 5 +- .../chameleon/image_processing_chameleon.py | 27 +- .../models/clip/image_processing_clip.py | 4 +- .../models/colpali/modular_colpali.py | 5 +- .../models/colpali/processing_colpali.py | 27 +- .../idefics2/image_processing_idefics2.py | 37 +- .../idefics3/image_processing_idefics3.py | 38 +- .../image_processing_instructblipvideo.py | 29 +- .../llava_next/image_processing_llava_next.py | 27 +- .../image_processing_llava_next_video.py | 26 +- .../image_processing_llava_onevision.py | 28 +- .../video_processing_llava_onevision.py | 28 +- .../models/mllama/image_processing_mllama.py | 40 +- .../models/mllama/processing_mllama.py | 7 +- .../models/paligemma/processing_paligemma.py | 28 +- .../qwen2_5_vl/image_processing_qwen2_5_vl.py | 50 +-- .../qwen2_vl/image_processing_qwen2_vl.py | 51 +-- .../image_processing_qwen2_vl_fast.py | 6 +- .../models/siglip/image_processing_siglip.py | 4 +- .../image_processing_video_llava.py | 26 +- tests/models/colpali/test_modeling_colpali.py | 4 - .../test_modeling_instructblipvideo.py | 6 +- tests/models/pixtral/test_modeling_pixtral.py | 5 - tests/utils/test_image_utils.py | 341 +++++++++++++++++- 27 files changed, 506 insertions(+), 485 deletions(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 90b5f44c56..4f8b5980a6 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -158,6 +158,10 @@ def is_valid_image(img): return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img) or is_tf_tensor(img) or is_jax_tensor(img) +def is_valid_list_of_images(images: List): + return images and all(is_valid_image(image) for image in images) + + def valid_images(imgs): # If we have an list of images, make sure every image is valid if isinstance(imgs, (list, tuple)): @@ -189,7 +193,7 @@ def is_scaled_image(image: np.ndarray) -> bool: def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]: """ - Ensure that the input is a list of images. If the input is a single image, it is converted to a list of length 1. + Ensure that the output is a list of images. If the input is a single image, it is converted to a list of length 1. If the input is a batch of images, it is converted to a list of images. Args: @@ -203,7 +207,7 @@ def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]: return images # Either the input is a single image, in which case we create a list of length 1 - if isinstance(images, PIL.Image.Image): + if is_pil_image(images): # PIL images are never batched return [images] @@ -226,6 +230,108 @@ def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]: ) +def make_flat_list_of_images( + images: Union[List[ImageInput], ImageInput], +) -> ImageInput: + """ + Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1. + If the input is a nested list of images, it is converted to a flat list of images. + Args: + images (`Union[List[ImageInput], ImageInput]`): + The input image. + Returns: + list: A list of images or a 4d array of images. + """ + # If the input is a nested list of images, we flatten it + if ( + isinstance(images, (list, tuple)) + and all(isinstance(images_i, (list, tuple)) for images_i in images) + and all(is_valid_list_of_images(images_i) for images_i in images) + ): + return [img for img_list in images for img in img_list] + + if isinstance(images, (list, tuple)) and is_valid_list_of_images(images): + if is_pil_image(images[0]) or images[0].ndim == 3: + return images + if images[0].ndim == 4: + return [img for img_list in images for img in img_list] + + if is_valid_image(images): + if is_pil_image(images) or images.ndim == 3: + return [images] + if images.ndim == 4: + return list(images) + + raise ValueError(f"Could not make a flat list of images from {images}") + + +def make_nested_list_of_images( + images: Union[List[ImageInput], ImageInput], +) -> ImageInput: + """ + Ensure that the output is a nested list of images. + Args: + images (`Union[List[ImageInput], ImageInput]`): + The input image. + Returns: + list: A list of list of images or a list of 4d array of images. + """ + # If it's a list of batches, it's already in the right format + if ( + isinstance(images, (list, tuple)) + and all(isinstance(images_i, (list, tuple)) for images_i in images) + and all(is_valid_list_of_images(images_i) for images_i in images) + ): + return images + + # If it's a list of images, it's a single batch, so convert it to a list of lists + if isinstance(images, (list, tuple)) and is_valid_list_of_images(images): + if is_pil_image(images[0]) or images[0].ndim == 3: + return [images] + if images[0].ndim == 4: + return [list(image) for image in images] + + # If it's a single image, convert it to a list of lists + if is_valid_image(images): + if is_pil_image(images) or images.ndim == 3: + return [[images]] + if images.ndim == 4: + return [list(images)] + + raise ValueError("Invalid input type. Must be a single image, a list of images, or a list of batches of images.") + + +def make_batched_videos(videos) -> VideoInput: + """ + Ensure that the input is a list of videos. + Args: + videos (`VideoInput`): + Video or videos to turn into a list of videos. + Returns: + list: A list of videos. + """ + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + # case 1: nested batch of videos so we flatten it + if not is_pil_image(videos[0][0]) and videos[0][0].ndim == 4: + videos = [video for batch_list in videos for video in batch_list] + # case 2: list of videos represented as list of video frames + return videos + + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if is_pil_image(videos[0]) or videos[0].ndim == 3: + return [videos] + elif videos[0].ndim == 4: + return [list(video) for video in videos] + + elif is_valid_image(videos): + if is_pil_image(videos) or videos.ndim == 3: + return [[videos]] + elif videos.ndim == 4: + return [list(videos)] + + raise ValueError(f"Could not make batched video from {videos}") + + def to_numpy_array(img) -> np.ndarray: if not is_valid_image(img): raise ValueError(f"Invalid image type: {type(img)}") diff --git a/src/transformers/models/aria/image_processing_aria.py b/src/transformers/models/aria/image_processing_aria.py index 7b00665aa2..de8637eb28 100644 --- a/src/transformers/models/aria/image_processing_aria.py +++ b/src/transformers/models/aria/image_processing_aria.py @@ -31,7 +31,7 @@ from ...image_utils import ( PILImageResampling, get_image_size, infer_channel_dimension_format, - is_valid_image, + make_flat_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, @@ -39,29 +39,6 @@ from ...image_utils import ( from ...utils import TensorType -def make_batched_images(images) -> List[List[ImageInput]]: - """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. - - Args: - images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): - The input image. - - Returns: - list: A list of images. - """ - if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): - return [img for img_list in images for img in img_list] - - elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): - return images - - elif is_valid_image(images): - return [images] - - raise ValueError(f"Could not make batched video from {images}") - - def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]: """ Divides an image into patches of a specified size. @@ -244,7 +221,7 @@ class AriaImageProcessor(BaseImageProcessor): if max_image_size not in [490, 980]: raise ValueError("max_image_size must be either 490 or 980") - images = make_batched_images(images) + images = make_flat_list_of_images(images) if not valid_images(images): raise ValueError( diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 8bb79616ea..5c348ae1ee 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -28,6 +28,7 @@ from ...image_utils import ( PILImageResampling, get_image_size, infer_channel_dimension_format, + make_flat_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, @@ -58,7 +59,7 @@ from ..llama.modeling_llama import ( LlamaRMSNorm, ) from ..llava.modeling_llava import LlavaCausalLMOutputWithPast -from ..llava_next.image_processing_llava_next import divide_to_patches, make_batched_images +from ..llava_next.image_processing_llava_next import divide_to_patches logger = logging.get_logger(__name__) @@ -609,7 +610,7 @@ class AriaImageProcessor(BaseImageProcessor): if max_image_size not in [490, 980]: raise ValueError("max_image_size must be either 490 or 980") - images = make_batched_images(images) + images = make_flat_list_of_images(images) if not valid_images(images): raise ValueError( diff --git a/src/transformers/models/blip/image_processing_blip.py b/src/transformers/models/blip/image_processing_blip.py index 0f7683d08d..df2aee157d 100644 --- a/src/transformers/models/blip/image_processing_blip.py +++ b/src/transformers/models/blip/image_processing_blip.py @@ -28,7 +28,7 @@ from ...image_utils import ( PILImageResampling, infer_channel_dimension_format, is_scaled_image, - make_list_of_images, + make_flat_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, @@ -231,8 +231,7 @@ class BlipImageProcessor(BaseImageProcessor): size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) - - images = make_list_of_images(images) + images = make_flat_list_of_images(images) if not valid_images(images): raise ValueError( diff --git a/src/transformers/models/chameleon/image_processing_chameleon.py b/src/transformers/models/chameleon/image_processing_chameleon.py index 4ef305c511..c9d110ad22 100644 --- a/src/transformers/models/chameleon/image_processing_chameleon.py +++ b/src/transformers/models/chameleon/image_processing_chameleon.py @@ -30,7 +30,7 @@ from ...image_utils import ( PILImageResampling, infer_channel_dimension_format, is_scaled_image, - is_valid_image, + make_flat_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, @@ -44,29 +44,6 @@ if is_vision_available(): import PIL -def make_batched_images(images) -> List[List[ImageInput]]: - """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. - - Args: - images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): - The input image. - - Returns: - list: A list of images. - """ - if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): - return [img for img_list in images for img in img_list] - - elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): - return images - - elif is_valid_image(images): - return [images] - - raise ValueError(f"Could not make batched video from {images}") - - class ChameleonImageProcessor(BaseImageProcessor): r""" Constructs a Chameleon image processor. @@ -275,7 +252,7 @@ class ChameleonImageProcessor(BaseImageProcessor): image_std = image_std if image_std is not None else self.image_std do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - images = make_batched_images(images) + images = make_flat_list_of_images(images) if not valid_images(images): raise ValueError( diff --git a/src/transformers/models/clip/image_processing_clip.py b/src/transformers/models/clip/image_processing_clip.py index c81451b195..2155b306bc 100644 --- a/src/transformers/models/clip/image_processing_clip.py +++ b/src/transformers/models/clip/image_processing_clip.py @@ -33,7 +33,7 @@ from ...image_utils import ( PILImageResampling, infer_channel_dimension_format, is_scaled_image, - make_list_of_images, + make_flat_list_of_images, to_numpy_array, valid_images, validate_kwargs, @@ -283,7 +283,7 @@ class CLIPImageProcessor(BaseImageProcessor): validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) - images = make_list_of_images(images) + images = make_flat_list_of_images(images) if not valid_images(images): raise ValueError( diff --git a/src/transformers/models/colpali/modular_colpali.py b/src/transformers/models/colpali/modular_colpali.py index ceb43e2d66..2cc6dded85 100644 --- a/src/transformers/models/colpali/modular_colpali.py +++ b/src/transformers/models/colpali/modular_colpali.py @@ -20,11 +20,10 @@ from transformers.models.paligemma.processing_paligemma import ( IMAGE_TOKEN, PaliGemmaProcessor, build_string_from_input, - make_batched_images, ) from ...feature_extraction_utils import BatchFeature -from ...image_utils import ImageInput, is_valid_image +from ...image_utils import ImageInput, is_valid_image, make_flat_list_of_images from ...processing_utils import ( ProcessingKwargs, Unpack, @@ -168,7 +167,7 @@ class ColPaliProcessor(PaliGemmaProcessor): ) for prompt, image_list in zip(texts_doc, images) ] - images = make_batched_images(images) + images = make_flat_list_of_images(images) pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] # max_length has to account for the image tokens diff --git a/src/transformers/models/colpali/processing_colpali.py b/src/transformers/models/colpali/processing_colpali.py index f8d6867579..342cd0cd3d 100644 --- a/src/transformers/models/colpali/processing_colpali.py +++ b/src/transformers/models/colpali/processing_colpali.py @@ -23,7 +23,7 @@ from typing import ClassVar, List, Optional, Union from ...feature_extraction_utils import BatchFeature -from ...image_utils import ImageInput, is_valid_image +from ...image_utils import ImageInput, is_valid_image, make_flat_list_of_images from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput from ...utils import is_torch_available @@ -72,29 +72,6 @@ def build_string_from_input(prompt, bos_token, image_seq_len, image_token, num_i return f"{image_token * image_seq_len * num_images}{bos_token}{prompt}\n" -def make_batched_images(images) -> List[List[ImageInput]]: - """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. - - Args: - images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): - The input image. - - Returns: - list: A list of images. - """ - if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): - return [img for img_list in images for img in img_list] - - elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): - return images - - elif is_valid_image(images): - return [images] - - raise ValueError(f"Could not make batched video from {images}") - - class ColPaliProcessor(ProcessorMixin): r""" Constructs a ColPali processor which wraps a PaliGemmaProcessor and special methods to process images and queries, as @@ -230,7 +207,7 @@ class ColPaliProcessor(ProcessorMixin): ) for prompt, image_list in zip(texts_doc, images) ] - images = make_batched_images(images) + images = make_flat_list_of_images(images) pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] # max_length has to account for the image tokens diff --git a/src/transformers/models/idefics2/image_processing_idefics2.py b/src/transformers/models/idefics2/image_processing_idefics2.py index 65d5a82854..927aba761c 100644 --- a/src/transformers/models/idefics2/image_processing_idefics2.py +++ b/src/transformers/models/idefics2/image_processing_idefics2.py @@ -29,7 +29,7 @@ from ...image_utils import ( get_image_size, infer_channel_dimension_format, is_scaled_image, - is_valid_image, + make_nested_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, @@ -77,39 +77,6 @@ def get_resize_output_image_size(image, size, input_data_format) -> Tuple[int, i return height, width -def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]: - """ - Convert a single image or a list of images to a list of numpy arrays. - - Args: - images (`ImageInput`): - A single image or a list of images. - - Returns: - A list of numpy arrays. - """ - # If it's a single image, convert it to a list of lists - if is_valid_image(images): - images = [[images]] - # If it's a list of images, it's a single batch, so convert it to a list of lists - elif isinstance(images, (list, tuple)) and len(images) > 0 and is_valid_image(images[0]): - images = [images] - # If it's a list of batches, it's already in the right format - elif ( - isinstance(images, (list, tuple)) - and len(images) > 0 - and isinstance(images[0], (list, tuple)) - and len(images[0]) > 0 - and is_valid_image(images[0][0]) - ): - pass - else: - raise ValueError( - "Invalid input type. Must be a single image, a list of images, or a list of batches of images." - ) - return images - - # Copied from transformers.models.detr.image_processing_detr.max_across_indices def max_across_indices(values: Iterable[Any]) -> List[Any]: """ @@ -504,7 +471,7 @@ class Idefics2ImageProcessor(BaseImageProcessor): do_pad = do_pad if do_pad is not None else self.do_pad do_image_splitting = do_image_splitting if do_image_splitting is not None else self.do_image_splitting - images_list = make_list_of_images(images) + images_list = make_nested_list_of_images(images) if not valid_images(images_list[0]): raise ValueError( diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py index df71a8bf0e..b8b30609b8 100644 --- a/src/transformers/models/idefics3/image_processing_idefics3.py +++ b/src/transformers/models/idefics3/image_processing_idefics3.py @@ -29,7 +29,7 @@ from ...image_utils import ( get_image_size, infer_channel_dimension_format, is_scaled_image, - is_valid_image, + make_nested_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, @@ -141,40 +141,6 @@ def get_resize_output_image_size( return height, width -# Copied from transformers.models.idefics2.image_processing_idefics2.make_list_of_images -def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]: - """ - Convert a single image or a list of images to a list of numpy arrays. - - Args: - images (`ImageInput`): - A single image or a list of images. - - Returns: - A list of numpy arrays. - """ - # If it's a single image, convert it to a list of lists - if is_valid_image(images): - images = [[images]] - # If it's a list of images, it's a single batch, so convert it to a list of lists - elif isinstance(images, (list, tuple)) and len(images) > 0 and is_valid_image(images[0]): - images = [images] - # If it's a list of batches, it's already in the right format - elif ( - isinstance(images, (list, tuple)) - and len(images) > 0 - and isinstance(images[0], (list, tuple)) - and len(images[0]) > 0 - and is_valid_image(images[0][0]) - ): - pass - else: - raise ValueError( - "Invalid input type. Must be a single image, a list of images, or a list of batches of images." - ) - return images - - # Copied from transformers.models.detr.image_processing_detr.max_across_indices def max_across_indices(values: Iterable[Any]) -> List[Any]: """ @@ -720,7 +686,7 @@ class Idefics3ImageProcessor(BaseImageProcessor): do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb do_pad = do_pad if do_pad is not None else self.do_pad - images_list = make_list_of_images(images) + images_list = make_nested_list_of_images(images) if not valid_images(images_list[0]): raise ValueError( diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index 75e07317b0..37cec22a9b 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -32,40 +32,17 @@ from ...image_utils import ( VideoInput, infer_channel_dimension_format, is_scaled_image, - is_valid_image, + make_batched_videos, to_numpy_array, valid_images, validate_preprocess_arguments, ) -from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging - - -if is_vision_available(): - import PIL +from ...utils import TensorType, filter_out_non_signature_kwargs, logging logger = logging.get_logger(__name__) -def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] - - elif is_valid_image(videos): - if isinstance(videos, PIL.Image.Image): - return [[videos]] - elif len(videos.shape) == 4: - return [list(videos)] - - raise ValueError(f"Could not make batched video from {videos}") - - # Copied from transformers.models.blip.image_processing_blip.BlipImageProcessor with Blip->InstructBlipVideo, BLIP->InstructBLIPVideo class InstructBlipVideoImageProcessor(BaseImageProcessor): r""" @@ -198,7 +175,7 @@ class InstructBlipVideoImageProcessor(BaseImageProcessor): do_convert_rgb: bool = None, data_format: ChannelDimension = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> PIL.Image.Image: + ) -> BatchFeature: """ Preprocess a video or batch of images/videos. diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index 8e2a4f4644..742ed4cbab 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -37,7 +37,7 @@ from ...image_utils import ( get_image_size, infer_channel_dimension_format, is_scaled_image, - is_valid_image, + make_flat_list_of_images, make_list_of_images, to_numpy_array, valid_images, @@ -53,29 +53,6 @@ if is_vision_available(): from PIL import Image -def make_batched_images(images) -> List[List[ImageInput]]: - """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. - - Args: - images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): - The input image. - - Returns: - list: A list of images. - """ - if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): - return [img for img_list in images for img in img_list] - - elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): - return images - - elif is_valid_image(images): - return [images] - - raise ValueError(f"Could not make batched video from {images}") - - def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]: """ Divides an image into patches of a specified size. @@ -670,7 +647,7 @@ class LlavaNextImageProcessor(BaseImageProcessor): do_pad = do_pad if do_pad is not None else self.do_pad do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - images = make_batched_images(images) + images = make_flat_list_of_images(images) if not valid_images(images): raise ValueError( diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 81f55f9373..3ec8d9db06 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -34,37 +34,17 @@ from ...image_utils import ( VideoInput, infer_channel_dimension_format, is_scaled_image, - is_valid_image, + make_batched_videos, make_list_of_images, to_numpy_array, validate_preprocess_arguments, ) -from ...utils import TensorType, is_vision_available, logging +from ...utils import TensorType, logging logger = logging.get_logger(__name__) -if is_vision_available(): - from PIL import Image - - -def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], Image.Image): - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] - - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] - - raise ValueError(f"Could not make batched video from {videos}") - - class LlavaNextVideoImageProcessor(BaseImageProcessor): r""" Constructs a LLaVa-NeXT-Video video processor. Based on [`CLIPImageProcessor`] with incorporation of processing each video frame. @@ -212,7 +192,7 @@ class LlavaNextVideoImageProcessor(BaseImageProcessor): do_convert_rgb: bool = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> Image.Image: + ) -> list[np.ndarray]: """ Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`. diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py index 75581d25ae..2243517504 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py @@ -36,7 +36,7 @@ from ...image_utils import ( get_image_size, infer_channel_dimension_format, is_scaled_image, - is_valid_image, + make_flat_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, @@ -51,30 +51,6 @@ if is_vision_available(): from PIL import Image -# Copied from transformers.models.llava_next.image_processing_llava_next.make_batched_images -def make_batched_images(images) -> List[List[ImageInput]]: - """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. - - Args: - images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): - The input image. - - Returns: - list: A list of images. - """ - if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): - return [img for img_list in images for img in img_list] - - elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): - return images - - elif is_valid_image(images): - return [images] - - raise ValueError(f"Could not make batched video from {images}") - - # Copied from transformers.models.llava_next.image_processing_llava_next.divide_to_patches def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]: """ @@ -632,7 +608,7 @@ class LlavaOnevisionImageProcessor(BaseImageProcessor): do_pad = do_pad if do_pad is not None else self.do_pad do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - images = make_batched_images(images) + images = make_flat_list_of_images(images) if not valid_images(images): raise ValueError( diff --git a/src/transformers/models/llava_onevision/video_processing_llava_onevision.py b/src/transformers/models/llava_onevision/video_processing_llava_onevision.py index a5aa42688e..743e9f2df6 100644 --- a/src/transformers/models/llava_onevision/video_processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/video_processing_llava_onevision.py @@ -16,6 +16,8 @@ from typing import Dict, List, Optional, Union +import numpy as np + from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import ( convert_to_rgb, @@ -31,37 +33,17 @@ from ...image_utils import ( VideoInput, infer_channel_dimension_format, is_scaled_image, - is_valid_image, + make_batched_videos, to_numpy_array, valid_images, validate_preprocess_arguments, ) -from ...utils import TensorType, is_vision_available, logging +from ...utils import TensorType, logging logger = logging.get_logger(__name__) -if is_vision_available(): - from PIL import Image - - -def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], Image.Image) or len(videos[0].shape) == 3: - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] - - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] - - raise ValueError(f"Could not make batched video from {videos}") - - class LlavaOnevisionVideoProcessor(BaseImageProcessor): r""" Constructs a LLaVa-Onevisino-Video video processor. Based on [`SiglipImageProcessor`] with incorporation of processing each video frame. @@ -138,7 +120,7 @@ class LlavaOnevisionVideoProcessor(BaseImageProcessor): do_convert_rgb: bool = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> Image.Image: + ) -> list[np.ndarray]: """ Args: images (`ImageInput`): diff --git a/src/transformers/models/mllama/image_processing_mllama.py b/src/transformers/models/mllama/image_processing_mllama.py index 3c85258967..9ff077f150 100644 --- a/src/transformers/models/mllama/image_processing_mllama.py +++ b/src/transformers/models/mllama/image_processing_mllama.py @@ -33,8 +33,8 @@ from ...image_utils import ( ImageInput, PILImageResampling, infer_channel_dimension_format, - is_valid_image, is_vision_available, + make_nested_list_of_images, to_numpy_array, validate_preprocess_arguments, ) @@ -514,42 +514,6 @@ def convert_to_rgb(image: ImageInput) -> ImageInput: return alpha_composite -# Modified from transformers.models.idefics2.image_processing_idefics2.make_list_of_images -def make_list_of_images(images: ImageInput) -> List[List[Optional[np.ndarray]]]: - """ - Convert a single image or a list of images to a list of numpy arrays. - - Args: - images (`ImageInput`): - A single image or a list of images. - - Returns: - A list of numpy arrays. - """ - # If it's a single image, convert it to a list of lists - if is_valid_image(images): - output_images = [[images]] - # If it's a list of images, it's a single batch, so convert it to a list of lists - elif isinstance(images, (list, tuple)) and is_valid_list_of_images(images): - output_images = [images] - # If it's a list of batches, it's already in the right format - elif ( - isinstance(images, (list, tuple)) - and all(isinstance(images_i, (list, tuple)) for images_i in images) - and any(is_valid_list_of_images(images_i) for images_i in images) - ): - output_images = images - else: - raise ValueError( - "Invalid input type. Must be a single image, a list of images, or a list of batches of images." - ) - return output_images - - -def is_valid_list_of_images(images: List): - return images and all(is_valid_image(image) for image in images) - - def _validate_size(size: Dict[str, int]) -> None: if not ("height" in size and "width" in size): raise ValueError(f"Argument `size` must be a dictionary with keys 'height' and 'width'. Got: {size}") @@ -726,7 +690,7 @@ class MllamaImageProcessor(BaseImageProcessor): # extra validation _validate_mllama_preprocess_arguments(do_resize, size, do_pad, max_image_tiles) - images_list = make_list_of_images(images) + images_list = make_nested_list_of_images(images) if self.do_convert_rgb: images_list = [[convert_to_rgb(image) for image in images] for images in images_list] diff --git a/src/transformers/models/mllama/processing_mllama.py b/src/transformers/models/mllama/processing_mllama.py index 5905f3313f..4e8f788cf7 100644 --- a/src/transformers/models/mllama/processing_mllama.py +++ b/src/transformers/models/mllama/processing_mllama.py @@ -20,16 +20,13 @@ from typing import List, Optional, Union import numpy as np from ...feature_extraction_utils import BatchFeature -from ...image_utils import ImageInput +from ...image_utils import ImageInput, make_nested_list_of_images from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import ( PreTokenizedInput, TextInput, ) -# TODO: Can we do it that way or its better include as "Copied from ..." -from .image_processing_mllama import make_list_of_images - class MllamaImagesKwargs(ImagesKwargs, total=False): max_image_tiles: Optional[int] @@ -292,7 +289,7 @@ class MllamaProcessor(ProcessorMixin): n_images_in_images = [0] if images is not None: - images = make_list_of_images(images) + images = make_nested_list_of_images(images) n_images_in_images = [len(sample) for sample in images] if text is not None: diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py index f2d0afed94..ac4b98e70b 100644 --- a/src/transformers/models/paligemma/processing_paligemma.py +++ b/src/transformers/models/paligemma/processing_paligemma.py @@ -19,7 +19,7 @@ Processor class for PaliGemma. from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature -from ...image_utils import ImageInput, is_valid_image +from ...image_utils import ImageInput, is_valid_image, make_flat_list_of_images from ...processing_utils import ( ImagesKwargs, ProcessingKwargs, @@ -99,30 +99,6 @@ def build_string_from_input(prompt, bos_token, image_seq_len, image_token, num_i return f"{image_token * image_seq_len * num_images}{bos_token}{prompt}\n" -# Copied from transformers.models.llava_next.image_processing_llava_next.make_batched_images -def make_batched_images(images) -> List[List[ImageInput]]: - """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. - - Args: - images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): - The input image. - - Returns: - list: A list of images. - """ - if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): - return [img for img_list in images for img in img_list] - - elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): - return images - - elif is_valid_image(images): - return [images] - - raise ValueError(f"Could not make batched video from {images}") - - class PaliGemmaProcessor(ProcessorMixin): r""" Constructs a PaliGemma processor which wraps a PaliGemma image processor and a PaliGemma tokenizer into a single processor. @@ -297,7 +273,7 @@ class PaliGemmaProcessor(ProcessorMixin): ) for prompt, image_list in zip(text, images) ] - images = make_batched_images(images) + images = make_flat_list_of_images(images) else: expanded_samples = [] for sample in text: diff --git a/src/transformers/models/qwen2_5_vl/image_processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/image_processing_qwen2_5_vl.py index 7101ae6035..168995f344 100644 --- a/src/transformers/models/qwen2_5_vl/image_processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/image_processing_qwen2_5_vl.py @@ -41,61 +41,19 @@ from ...image_utils import ( get_image_size, infer_channel_dimension_format, is_scaled_image, - is_valid_image, + make_batched_videos, + make_flat_list_of_images, make_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, ) -from ...utils import TensorType, is_vision_available, logging - - -if is_vision_available(): - from PIL import Image +from ...utils import TensorType, logging logger = logging.get_logger(__name__) -def make_batched_images(images) -> List[List[ImageInput]]: - """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. - - Args: - images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): - The input image. - - Returns: - list: A list of images. - """ - if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): - return [img for img_list in images for img in img_list] - - elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): - return images - - elif is_valid_image(images): - return [images] - - raise ValueError(f"Could not make batched images from {images}") - - -def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], Image.Image): - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] - - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] - - raise ValueError(f"Could not make batched video from {videos}") - - def smart_resize( height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280 ): @@ -398,7 +356,7 @@ class Qwen2_5_VLImageProcessor(BaseImageProcessor): do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb if images is not None: - images = make_batched_images(images) + images = make_flat_list_of_images(images) if videos is not None: videos = make_batched_videos(videos) diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py index b8656a9103..51b657327c 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py @@ -40,62 +40,19 @@ from ...image_utils import ( get_image_size, infer_channel_dimension_format, is_scaled_image, - is_valid_image, + make_batched_videos, + make_flat_list_of_images, make_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, ) -from ...utils import TensorType, is_vision_available, logging +from ...utils import TensorType, logging logger = logging.get_logger(__name__) -if is_vision_available(): - from PIL import Image - - -def make_batched_images(images) -> List[List[ImageInput]]: - """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. - - Args: - images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): - The input image. - - Returns: - list: A list of images. - """ - if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): - return [img for img_list in images for img in img_list] - - elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): - return images - - elif is_valid_image(images): - return [images] - - raise ValueError(f"Could not make batched images from {images}") - - -# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos -def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], Image.Image): - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] - - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] - - raise ValueError(f"Could not make batched video from {videos}") - - def smart_resize( height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280 ): @@ -392,7 +349,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor): do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb if images is not None: - images = make_batched_images(images) + images = make_flat_list_of_images(images) if videos is not None: videos = make_batched_videos(videos) diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py index a08b838fac..2283da6097 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py @@ -39,6 +39,8 @@ from ...image_utils import ( get_image_size, get_image_type, infer_channel_dimension_format, + make_batched_videos, + make_flat_list_of_images, make_list_of_images, valid_images, validate_preprocess_arguments, @@ -51,7 +53,7 @@ from ...utils import ( is_vision_available, logging, ) -from .image_processing_qwen2_vl import make_batched_images, make_batched_videos, smart_resize +from .image_processing_qwen2_vl import smart_resize if is_torch_available(): @@ -350,7 +352,7 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast): image_std = tuple(image_std) if isinstance(image_std, list) else image_std if images is not None: - images = make_batched_images(images) + images = make_flat_list_of_images(images) if videos is not None: videos = make_batched_videos(videos) diff --git a/src/transformers/models/siglip/image_processing_siglip.py b/src/transformers/models/siglip/image_processing_siglip.py index b87adb7492..d582687806 100644 --- a/src/transformers/models/siglip/image_processing_siglip.py +++ b/src/transformers/models/siglip/image_processing_siglip.py @@ -30,7 +30,7 @@ from ...image_utils import ( PILImageResampling, infer_channel_dimension_format, is_scaled_image, - make_list_of_images, + make_flat_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, @@ -181,7 +181,7 @@ class SiglipImageProcessor(BaseImageProcessor): image_std = image_std if image_std is not None else self.image_std do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - images = make_list_of_images(images) + images = make_flat_list_of_images(images) if not valid_images(images): raise ValueError( diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 4e97834617..dbb1054857 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -34,38 +34,18 @@ from ...image_utils import ( VideoInput, infer_channel_dimension_format, is_scaled_image, - is_valid_image, + make_batched_videos, make_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, ) -from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging +from ...utils import TensorType, filter_out_non_signature_kwargs, logging logger = logging.get_logger(__name__) -if is_vision_available(): - import PIL - - -def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] - - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] - - raise ValueError(f"Could not make batched video from {videos}") - - class VideoLlavaImageProcessor(BaseImageProcessor): r""" Constructs a CLIP image processor. @@ -208,7 +188,7 @@ class VideoLlavaImageProcessor(BaseImageProcessor): return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> PIL.Image.Image: + ) -> BatchFeature: """ Preprocess an image or batch of images. diff --git a/tests/models/colpali/test_modeling_colpali.py b/tests/models/colpali/test_modeling_colpali.py index 6f3ce6b96b..5e94ecaab9 100644 --- a/tests/models/colpali/test_modeling_colpali.py +++ b/tests/models/colpali/test_modeling_colpali.py @@ -26,7 +26,6 @@ from tests.test_configuration_common import ConfigTester from tests.test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from transformers import ( is_torch_available, - is_vision_available, ) from transformers.models.colpali.configuration_colpali import ColPaliConfig from transformers.models.colpali.modeling_colpali import ColPaliForRetrieval, ColPaliForRetrievalOutput @@ -43,9 +42,6 @@ from transformers.testing_utils import ( if is_torch_available(): import torch -if is_vision_available(): - pass - class ColPaliForRetrievalModelTester: def __init__( diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index ef95aab8bf..76c5c11de2 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -39,7 +39,7 @@ from transformers.testing_utils import ( slow, torch_device, ) -from transformers.utils import is_torch_available, is_vision_available +from transformers.utils import is_torch_available from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -58,10 +58,6 @@ if is_torch_available(): from transformers import InstructBlipVideoForConditionalGeneration, InstructBlipVideoVisionModel -if is_vision_available(): - pass - - class InstructBlipVideoVisionModelTester: def __init__( self, diff --git a/tests/models/pixtral/test_modeling_pixtral.py b/tests/models/pixtral/test_modeling_pixtral.py index f254d9eecd..173ddc1a13 100644 --- a/tests/models/pixtral/test_modeling_pixtral.py +++ b/tests/models/pixtral/test_modeling_pixtral.py @@ -20,7 +20,6 @@ from transformers import ( PixtralVisionConfig, PixtralVisionModel, is_torch_available, - is_vision_available, ) from transformers.testing_utils import ( require_torch, @@ -35,10 +34,6 @@ if is_torch_available(): import torch -if is_vision_available(): - pass - - class PixtralVisionModelTester: def __init__( self, diff --git a/tests/utils/test_image_utils.py b/tests/utils/test_image_utils.py index 1fa84aa5db..d4ce1435a1 100644 --- a/tests/utils/test_image_utils.py +++ b/tests/utils/test_image_utils.py @@ -28,7 +28,14 @@ from requests import ConnectTimeout, ReadTimeout from tests.pipelines.test_pipelines_document_question_answering import INVOICE_URL from transformers import is_torch_available, is_vision_available -from transformers.image_utils import ChannelDimension, get_channel_dimension_axis, make_list_of_images +from transformers.image_utils import ( + ChannelDimension, + get_channel_dimension_axis, + make_batched_videos, + make_flat_list_of_images, + make_list_of_images, + make_nested_list_of_images, +) from transformers.testing_utils import is_flaky, require_torch, require_vision @@ -115,6 +122,21 @@ class ImageFeatureExtractionTester(unittest.TestCase): self.assertEqual(array5.shape, (3, 16, 32)) self.assertTrue(np.array_equal(array5, array1)) + def test_make_list_of_images_pil(self): + # Test a single image is converted to a list of 1 image + pil_image = get_random_image(16, 32) + images_list = make_list_of_images(pil_image) + self.assertIsInstance(images_list, list) + self.assertEqual(len(images_list), 1) + self.assertIsInstance(images_list[0], PIL.Image.Image) + + # Test a list of images is not modified + images = [get_random_image(16, 32) for _ in range(4)] + images_list = make_list_of_images(images) + self.assertIsInstance(images_list, list) + self.assertEqual(len(images_list), 4) + self.assertIsInstance(images_list[0], PIL.Image.Image) + def test_make_list_of_images_numpy(self): # Test a single image is converted to a list of 1 image images = np.random.randint(0, 256, (16, 32, 3)) @@ -167,6 +189,323 @@ class ImageFeatureExtractionTester(unittest.TestCase): self.assertTrue(np.array_equal(images_list[0], images[0])) self.assertIsInstance(images_list, list) + def test_make_flat_list_of_images_pil(self): + # Test a single image is converted to a list of 1 image + pil_image = get_random_image(16, 32) + images_list = make_flat_list_of_images(pil_image) + self.assertIsInstance(images_list, list) + self.assertEqual(len(images_list), 1) + self.assertIsInstance(images_list[0], PIL.Image.Image) + + # Test a list of images is not modified + images = [get_random_image(16, 32) for _ in range(4)] + images_list = make_flat_list_of_images(images) + self.assertIsInstance(images_list, list) + self.assertEqual(len(images_list), 4) + self.assertIsInstance(images_list[0], PIL.Image.Image) + + # Test a nested list of images is flattened + images = [[get_random_image(16, 32) for _ in range(2)] for _ in range(2)] + images_list = make_flat_list_of_images(images) + self.assertIsInstance(images_list, list) + self.assertEqual(len(images_list), 4) + self.assertIsInstance(images_list[0], PIL.Image.Image) + + def test_make_flat_list_of_images_numpy(self): + # Test a single image is converted to a list of 1 image + images = np.random.randint(0, 256, (16, 32, 3)) + images_list = make_flat_list_of_images(images) + self.assertEqual(len(images_list), 1) + self.assertTrue(np.array_equal(images_list[0], images)) + self.assertIsInstance(images_list, list) + + # Test a 4d array of images is changed to a list of images + images = np.random.randint(0, 256, (4, 16, 32, 3)) + images_list = make_flat_list_of_images(images) + self.assertEqual(len(images_list), 4) + self.assertIsInstance(images_list, list) + self.assertIsInstance(images_list[0], np.ndarray) + self.assertTrue(np.array_equal(images_list[0], images[0])) + + # Test a list of images is not modified + images = [np.random.randint(0, 256, (16, 32, 3)) for _ in range(4)] + images_list = make_flat_list_of_images(images) + self.assertEqual(len(images_list), 4) + self.assertTrue(np.array_equal(images_list[0], images[0])) + self.assertIsInstance(images_list, list) + + # Test list of 4d array images is flattened + images = [np.random.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)] + images_list = make_flat_list_of_images(images) + self.assertEqual(len(images_list), 8) + self.assertTrue(np.array_equal(images_list[0], images[0][0])) + self.assertIsInstance(images_list, list) + self.assertIsInstance(images_list[0], np.ndarray) + + # Test nested list of images is flattened + images = [[np.random.randint(0, 256, (16, 32, 3)) for _ in range(2)] for _ in range(2)] + images_list = make_flat_list_of_images(images) + self.assertEqual(len(images_list), 4) + self.assertTrue(np.array_equal(images_list[0], images[0][0])) + self.assertIsInstance(images_list, list) + + @require_torch + def test_make_flat_list_of_images_torch(self): + # Test a single image is converted to a list of 1 image + images = torch.randint(0, 256, (16, 32, 3)) + images_list = make_flat_list_of_images(images) + self.assertEqual(len(images_list), 1) + self.assertTrue(np.array_equal(images_list[0], images)) + self.assertIsInstance(images_list, list) + + # Test a 4d tensors of images is changed to a list of images + images = torch.randint(0, 256, (4, 16, 32, 3)) + images_list = make_flat_list_of_images(images) + self.assertEqual(len(images_list), 4) + self.assertIsInstance(images_list, list) + self.assertIsInstance(images_list[0], torch.Tensor) + self.assertTrue(np.array_equal(images_list[0], images[0])) + + # Test a list of images is not modified + images = [torch.randint(0, 256, (16, 32, 3)) for _ in range(4)] + images_list = make_flat_list_of_images(images) + self.assertEqual(len(images_list), 4) + self.assertTrue(np.array_equal(images_list[0], images[0])) + self.assertIsInstance(images_list, list) + + # Test list of 4d tensors of imagess is flattened + images = [torch.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)] + images_list = make_flat_list_of_images(images) + self.assertEqual(len(images_list), 8) + self.assertTrue(np.array_equal(images_list[0], images[0][0])) + self.assertIsInstance(images_list, list) + self.assertIsInstance(images_list[0], torch.Tensor) + + # Test nested list of images is flattened + images = [[torch.randint(0, 256, (16, 32, 3)) for _ in range(2)] for _ in range(2)] + images_list = make_flat_list_of_images(images) + self.assertEqual(len(images_list), 4) + self.assertTrue(np.array_equal(images_list[0], images[0][0])) + self.assertIsInstance(images_list, list) + + def test_make_nested_list_of_images_pil(self): + # Test a single image is converted to a nested list of 1 image + pil_image = get_random_image(16, 32) + images_list = make_nested_list_of_images(pil_image) + self.assertIsInstance(images_list[0], list) + self.assertEqual(len(images_list[0]), 1) + self.assertIsInstance(images_list[0][0], PIL.Image.Image) + + # Test a list of images is converted to a nested list of images + images = [get_random_image(16, 32) for _ in range(4)] + images_list = make_nested_list_of_images(images) + self.assertIsInstance(images_list[0], list) + self.assertEqual(len(images_list), 1) + self.assertEqual(len(images_list[0]), 4) + self.assertIsInstance(images_list[0][0], PIL.Image.Image) + + # Test a nested list of images is not modified + images = [[get_random_image(16, 32) for _ in range(2)] for _ in range(2)] + images_list = make_nested_list_of_images(images) + self.assertIsInstance(images_list[0], list) + self.assertEqual(len(images_list), 2) + self.assertEqual(len(images_list[0]), 2) + self.assertIsInstance(images_list[0][0], PIL.Image.Image) + + def test_make_nested_list_of_images_numpy(self): + # Test a single image is converted to a nested list of 1 image + images = np.random.randint(0, 256, (16, 32, 3)) + images_list = make_nested_list_of_images(images) + self.assertIsInstance(images_list[0], list) + self.assertEqual(len(images_list), 1) + self.assertTrue(np.array_equal(images_list[0][0], images)) + + # Test a 4d array of images is converted to a nested list of images + images = np.random.randint(0, 256, (4, 16, 32, 3)) + images_list = make_nested_list_of_images(images) + self.assertIsInstance(images_list[0], list) + self.assertIsInstance(images_list[0][0], np.ndarray) + self.assertEqual(len(images_list), 1) + self.assertEqual(len(images_list[0]), 4) + self.assertTrue(np.array_equal(images_list[0][0], images[0])) + + # Test a list of images is converted to a nested list of images + images = [np.random.randint(0, 256, (16, 32, 3)) for _ in range(4)] + images_list = make_nested_list_of_images(images) + self.assertIsInstance(images_list[0], list) + self.assertEqual(len(images_list), 1) + self.assertEqual(len(images_list[0]), 4) + self.assertTrue(np.array_equal(images_list[0][0], images[0])) + + # Test a nested list of images is left unchanged + images = [[np.random.randint(0, 256, (16, 32, 3)) for _ in range(2)] for _ in range(2)] + images_list = make_nested_list_of_images(images) + self.assertIsInstance(images_list[0], list) + self.assertEqual(len(images_list), 2) + self.assertEqual(len(images_list[0]), 2) + self.assertTrue(np.array_equal(images_list[0][0], images[0][0])) + + # Test a list of 4d array images is converted to a nested list of images + images = [np.random.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)] + images_list = make_nested_list_of_images(images) + self.assertIsInstance(images_list[0], list) + self.assertIsInstance(images_list[0][0], np.ndarray) + self.assertEqual(len(images_list), 2) + self.assertEqual(len(images_list[0]), 4) + self.assertTrue(np.array_equal(images_list[0][0], images[0][0])) + + @require_torch + def test_make_nested_list_of_images_torch(self): + # Test a single image is converted to a nested list of 1 image + images = torch.randint(0, 256, (16, 32, 3)) + images_list = make_nested_list_of_images(images) + self.assertIsInstance(images_list[0], list) + self.assertEqual(len(images_list[0]), 1) + self.assertTrue(np.array_equal(images_list[0][0], images)) + + # Test a 4d tensor of images is converted to a nested list of images + images = torch.randint(0, 256, (4, 16, 32, 3)) + images_list = make_nested_list_of_images(images) + self.assertIsInstance(images_list[0], list) + self.assertIsInstance(images_list[0][0], torch.Tensor) + self.assertEqual(len(images_list), 1) + self.assertEqual(len(images_list[0]), 4) + self.assertTrue(np.array_equal(images_list[0][0], images[0])) + + # Test a list of images is converted to a nested list of images + images = [torch.randint(0, 256, (16, 32, 3)) for _ in range(4)] + images_list = make_nested_list_of_images(images) + self.assertIsInstance(images_list[0], list) + self.assertEqual(len(images_list), 1) + self.assertEqual(len(images_list[0]), 4) + self.assertTrue(np.array_equal(images_list[0][0], images[0])) + + # Test a nested list of images is left unchanged + images = [[torch.randint(0, 256, (16, 32, 3)) for _ in range(2)] for _ in range(2)] + images_list = make_nested_list_of_images(images) + self.assertIsInstance(images_list[0], list) + self.assertEqual(len(images_list), 2) + self.assertEqual(len(images_list[0]), 2) + self.assertTrue(np.array_equal(images_list[0][0], images[0][0])) + + # Test a list of 4d tensor images is converted to a nested list of images + images = [torch.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)] + images_list = make_nested_list_of_images(images) + self.assertIsInstance(images_list[0], list) + self.assertIsInstance(images_list[0][0], torch.Tensor) + self.assertEqual(len(images_list), 2) + self.assertEqual(len(images_list[0]), 4) + self.assertTrue(np.array_equal(images_list[0][0], images[0][0])) + + def test_make_batched_videos_pil(self): + # Test a single image is converted to a list of 1 video with 1 frame + pil_image = get_random_image(16, 32) + videos_list = make_batched_videos(pil_image) + self.assertIsInstance(videos_list[0], list) + self.assertEqual(len(videos_list[0]), 1) + self.assertIsInstance(videos_list[0][0], PIL.Image.Image) + + # Test a list of images is converted to a list of 1 video + images = [get_random_image(16, 32) for _ in range(4)] + videos_list = make_batched_videos(images) + self.assertIsInstance(videos_list[0], list) + self.assertEqual(len(videos_list), 1) + self.assertEqual(len(videos_list[0]), 4) + self.assertIsInstance(videos_list[0][0], PIL.Image.Image) + + # Test a nested list of images is not modified + images = [[get_random_image(16, 32) for _ in range(2)] for _ in range(2)] + videos_list = make_nested_list_of_images(images) + self.assertIsInstance(videos_list[0], list) + self.assertEqual(len(videos_list), 2) + self.assertEqual(len(videos_list[0]), 2) + self.assertIsInstance(videos_list[0][0], PIL.Image.Image) + + def test_make_batched_videos_numpy(self): + # Test a single image is converted to a list of 1 video with 1 frame + images = np.random.randint(0, 256, (16, 32, 3)) + videos_list = make_nested_list_of_images(images) + self.assertIsInstance(videos_list[0], list) + self.assertEqual(len(videos_list), 1) + self.assertTrue(np.array_equal(videos_list[0][0], images)) + + # Test a 4d array of images is converted to a a list of 1 video + images = np.random.randint(0, 256, (4, 16, 32, 3)) + videos_list = make_nested_list_of_images(images) + self.assertIsInstance(videos_list[0], list) + self.assertIsInstance(videos_list[0][0], np.ndarray) + self.assertEqual(len(videos_list), 1) + self.assertEqual(len(videos_list[0]), 4) + self.assertTrue(np.array_equal(videos_list[0][0], images[0])) + + # Test a list of images is converted to a list of videos + images = [np.random.randint(0, 256, (16, 32, 3)) for _ in range(4)] + videos_list = make_nested_list_of_images(images) + self.assertIsInstance(videos_list[0], list) + self.assertEqual(len(videos_list), 1) + self.assertEqual(len(videos_list[0]), 4) + self.assertTrue(np.array_equal(videos_list[0][0], images[0])) + + # Test a nested list of images is left unchanged + images = [[np.random.randint(0, 256, (16, 32, 3)) for _ in range(2)] for _ in range(2)] + videos_list = make_nested_list_of_images(images) + self.assertIsInstance(videos_list[0], list) + self.assertEqual(len(videos_list), 2) + self.assertEqual(len(videos_list[0]), 2) + self.assertTrue(np.array_equal(videos_list[0][0], images[0][0])) + + # Test a list of 4d array images is converted to a list of videos + images = [np.random.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)] + videos_list = make_nested_list_of_images(images) + self.assertIsInstance(videos_list[0], list) + self.assertIsInstance(videos_list[0][0], np.ndarray) + self.assertEqual(len(videos_list), 2) + self.assertEqual(len(videos_list[0]), 4) + self.assertTrue(np.array_equal(videos_list[0][0], images[0][0])) + + @require_torch + def test_make_batched_videos_torch(self): + # Test a single image is converted to a list of 1 video with 1 frame + images = torch.randint(0, 256, (16, 32, 3)) + videos_list = make_nested_list_of_images(images) + self.assertIsInstance(videos_list[0], list) + self.assertEqual(len(videos_list[0]), 1) + self.assertTrue(np.array_equal(videos_list[0][0], images)) + + # Test a 4d tensor of images is converted to a list of 1 video + images = torch.randint(0, 256, (4, 16, 32, 3)) + videos_list = make_nested_list_of_images(images) + self.assertIsInstance(videos_list[0], list) + self.assertIsInstance(videos_list[0][0], torch.Tensor) + self.assertEqual(len(videos_list), 1) + self.assertEqual(len(videos_list[0]), 4) + self.assertTrue(np.array_equal(videos_list[0][0], images[0])) + + # Test a list of images is converted to a list of videos + images = [torch.randint(0, 256, (16, 32, 3)) for _ in range(4)] + videos_list = make_nested_list_of_images(images) + self.assertIsInstance(videos_list[0], list) + self.assertEqual(len(videos_list), 1) + self.assertEqual(len(videos_list[0]), 4) + self.assertTrue(np.array_equal(videos_list[0][0], images[0])) + + # Test a nested list of images is left unchanged + images = [[torch.randint(0, 256, (16, 32, 3)) for _ in range(2)] for _ in range(2)] + videos_list = make_nested_list_of_images(images) + self.assertIsInstance(videos_list[0], list) + self.assertEqual(len(videos_list), 2) + self.assertEqual(len(videos_list[0]), 2) + self.assertTrue(np.array_equal(videos_list[0][0], images[0][0])) + + # Test a list of 4d tensor images is converted to a list of videos + images = [torch.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)] + videos_list = make_nested_list_of_images(images) + self.assertIsInstance(videos_list[0], list) + self.assertIsInstance(videos_list[0][0], torch.Tensor) + self.assertEqual(len(videos_list), 2) + self.assertEqual(len(videos_list[0]), 4) + self.assertTrue(np.array_equal(videos_list[0][0], images[0][0])) + @require_torch def test_conversion_torch_to_array(self): feature_extractor = ImageFeatureExtractionMixin()