Add support for nested images to LLava and VipLLava (#35558)

* move make_flat_list_of_images and make_batched_videos to image_utils

* remove unnecessary is_vision_available

* move make_nested_list_of_images to image_utils

* fix fast pixtral image processor

* fix import mllama

* fix make_nested_list_of_images

* add tests

* convert 4d arrays/tensors to list

* add test_make_batched_videos

* add support for nested batches of videos

* fix image processing qwen2vl
This commit is contained in:
Yoni Gozlan
2025-01-30 16:49:20 -05:00
committed by GitHub
parent e4227eb4d4
commit d7188ba600
27 changed files with 506 additions and 485 deletions

View File

@@ -158,6 +158,10 @@ def is_valid_image(img):
return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img) or is_tf_tensor(img) or is_jax_tensor(img)
def is_valid_list_of_images(images: List) -> bool:
    """
    Check that `images` is a non-empty collection in which every element is a valid image.

    Args:
        images (`List`):
            The collection to check.

    Returns:
        `bool`: `True` if `images` is non-empty and every element passes `is_valid_image`, `False` otherwise.
    """
    # `bool(...)` keeps the result a real boolean: a bare `images and ...` would hand back the empty
    # container itself when `images` is empty.
    return bool(images) and all(is_valid_image(image) for image in images)
def valid_images(imgs):
# If we have an list of images, make sure every image is valid
if isinstance(imgs, (list, tuple)):
@@ -189,7 +193,7 @@ def is_scaled_image(image: np.ndarray) -> bool:
def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]:
"""
Ensure that the input is a list of images. If the input is a single image, it is converted to a list of length 1.
Ensure that the output is a list of images. If the input is a single image, it is converted to a list of length 1.
If the input is a batch of images, it is converted to a list of images.
Args:
@@ -203,7 +207,7 @@ def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]:
return images
# Either the input is a single image, in which case we create a list of length 1
if isinstance(images, PIL.Image.Image):
if is_pil_image(images):
# PIL images are never batched
return [images]
@@ -226,6 +230,108 @@ def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]:
)
def make_flat_list_of_images(
    images: Union[List[ImageInput], ImageInput],
) -> ImageInput:
    """
    Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1.
    If the input is a nested list of images, it is converted to a flat list of images.

    Args:
        images (`Union[List[ImageInput], ImageInput]`):
            A single image, a 4d batch of images, a list of images (or of 4d batches), or a nested list of images.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If `images` matches none of the supported layouts.
    """
    # If the input is a nested list of images, we flatten it. Empty sub-lists (samples without any image)
    # are tolerated and simply contribute nothing to the flattened result.
    if (
        isinstance(images, (list, tuple))
        and all(isinstance(images_i, (list, tuple)) for images_i in images)
        and all(not images_i or is_valid_list_of_images(images_i) for images_i in images)
    ):
        return [img for img_list in images for img in img_list]

    if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
        # Already a flat list of individual images
        if is_pil_image(images[0]) or images[0].ndim == 3:
            return images
        # A list of 4d batches: iterate over each batch to unpack its images
        if images[0].ndim == 4:
            return [img for img_list in images for img in img_list]

    if is_valid_image(images):
        if is_pil_image(images) or images.ndim == 3:
            return [images]
        # A single 4d batch is split along its first dimension
        if images.ndim == 4:
            return list(images)

    raise ValueError(f"Could not make a flat list of images from {images}")
def make_nested_list_of_images(
    images: Union[List[ImageInput], ImageInput],
) -> ImageInput:
    """
    Ensure that the output is a nested list of images.

    Args:
        images (`Union[List[ImageInput], ImageInput]`):
            A single image, a 4d batch of images, a list of images (or of 4d batches), or a list of lists of images.

    Returns:
        list: A list of lists of images.

    Raises:
        ValueError: If `images` matches none of the supported layouts.
    """
    # If it's a list of batches, it's already in the right format. Empty batches (samples without any image)
    # are tolerated so that batches mixing samples with and without images are accepted.
    if (
        isinstance(images, (list, tuple))
        and all(isinstance(images_i, (list, tuple)) for images_i in images)
        and all(not images_i or is_valid_list_of_images(images_i) for images_i in images)
    ):
        return images

    # If it's a list of images, it's a single batch, so convert it to a list of lists
    if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
        if is_pil_image(images[0]) or images[0].ndim == 3:
            return [images]
        # Each 4d entry is split along its first dimension into its own inner list
        if images[0].ndim == 4:
            return [list(image) for image in images]

    # If it's a single image, convert it to a list of lists
    if is_valid_image(images):
        if is_pil_image(images) or images.ndim == 3:
            return [[images]]
        if images.ndim == 4:
            return [list(images)]

    raise ValueError("Invalid input type. Must be a single image, a list of images, or a list of batches of images.")
def make_batched_videos(videos) -> VideoInput:
    """
    Ensure that the output is a list of videos.

    Args:
        videos (`VideoInput`):
            Video or videos to turn into a list of videos.

    Returns:
        list: A list of videos.

    Raises:
        ValueError: If `videos` is empty or matches none of the supported layouts.
    """
    if isinstance(videos, (list, tuple)):
        # Guard the `[0]` lookups so that empty input reaches the ValueError below instead of an IndexError.
        if videos:
            first = videos[0]
            if isinstance(first, (list, tuple)):
                if first and is_valid_image(first[0]):
                    # case 1: nested batch of videos (inner entries are 4d arrays) so we flatten it
                    if not is_pil_image(first[0]) and first[0].ndim == 4:
                        return [video for batch_list in videos for video in batch_list]
                    # case 2: list of videos represented as list of video frames
                    return videos
            elif is_valid_image(first):
                # a flat list of frames is a single video
                if is_pil_image(first) or first.ndim == 3:
                    return [videos]
                # a list of 4d arrays: each array is split into its frames
                if first.ndim == 4:
                    return [list(video) for video in videos]
    elif is_valid_image(videos):
        # a single frame becomes a one-frame video
        if is_pil_image(videos) or videos.ndim == 3:
            return [[videos]]
        # a single 4d array is one video, split into its frames
        if videos.ndim == 4:
            return [list(videos)]

    raise ValueError(f"Could not make batched video from {videos}")
def to_numpy_array(img) -> np.ndarray:
if not is_valid_image(img):
raise ValueError(f"Invalid image type: {type(img)}")

View File

@@ -31,7 +31,7 @@ from ...image_utils import (
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_valid_image,
make_flat_list_of_images,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
@@ -39,29 +39,6 @@ from ...image_utils import (
from ...utils import TensorType
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If `images` is empty or is not a single image, a list of images or a nested list of images.
    """
    if isinstance(images, (list, tuple)):
        # Guard the `[0]` lookups so that empty input reaches the ValueError below instead of an IndexError.
        if images:
            first = images[0]
            if isinstance(first, (list, tuple)):
                # Nested list: only the very first image is inspected before flattening
                if first and is_valid_image(first[0]):
                    return [img for img_list in images for img in img_list]
            elif is_valid_image(first):
                return images
    elif is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
"""
Divides an image into patches of a specified size.
@@ -244,7 +221,7 @@ class AriaImageProcessor(BaseImageProcessor):
if max_image_size not in [490, 980]:
raise ValueError("max_image_size must be either 490 or 980")
images = make_batched_images(images)
images = make_flat_list_of_images(images)
if not valid_images(images):
raise ValueError(

View File

@@ -28,6 +28,7 @@ from ...image_utils import (
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
make_flat_list_of_images,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
@@ -58,7 +59,7 @@ from ..llama.modeling_llama import (
LlamaRMSNorm,
)
from ..llava.modeling_llava import LlavaCausalLMOutputWithPast
from ..llava_next.image_processing_llava_next import divide_to_patches, make_batched_images
from ..llava_next.image_processing_llava_next import divide_to_patches
logger = logging.get_logger(__name__)
@@ -609,7 +610,7 @@ class AriaImageProcessor(BaseImageProcessor):
if max_image_size not in [490, 980]:
raise ValueError("max_image_size must be either 490 or 980")
images = make_batched_images(images)
images = make_flat_list_of_images(images)
if not valid_images(images):
raise ValueError(

View File

@@ -28,7 +28,7 @@ from ...image_utils import (
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
make_flat_list_of_images,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
@@ -231,8 +231,7 @@ class BlipImageProcessor(BaseImageProcessor):
size = size if size is not None else self.size
size = get_size_dict(size, default_to_square=False)
images = make_list_of_images(images)
images = make_flat_list_of_images(images)
if not valid_images(images):
raise ValueError(

View File

@@ -30,7 +30,7 @@ from ...image_utils import (
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
make_flat_list_of_images,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
@@ -44,29 +44,6 @@ if is_vision_available():
import PIL
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If `images` is empty or is not a single image, a list of images or a nested list of images.
    """
    if isinstance(images, (list, tuple)):
        # Guard the `[0]` lookups so that empty input reaches the ValueError below instead of an IndexError.
        if images:
            first = images[0]
            if isinstance(first, (list, tuple)):
                # Nested list: only the very first image is inspected before flattening
                if first and is_valid_image(first[0]):
                    return [img for img_list in images for img in img_list]
            elif is_valid_image(first):
                return images
    elif is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
class ChameleonImageProcessor(BaseImageProcessor):
r"""
Constructs a Chameleon image processor.
@@ -275,7 +252,7 @@ class ChameleonImageProcessor(BaseImageProcessor):
image_std = image_std if image_std is not None else self.image_std
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
images = make_batched_images(images)
images = make_flat_list_of_images(images)
if not valid_images(images):
raise ValueError(

View File

@@ -33,7 +33,7 @@ from ...image_utils import (
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
make_flat_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
@@ -283,7 +283,7 @@ class CLIPImageProcessor(BaseImageProcessor):
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
images = make_list_of_images(images)
images = make_flat_list_of_images(images)
if not valid_images(images):
raise ValueError(

View File

@@ -20,11 +20,10 @@ from transformers.models.paligemma.processing_paligemma import (
IMAGE_TOKEN,
PaliGemmaProcessor,
build_string_from_input,
make_batched_images,
)
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, is_valid_image
from ...image_utils import ImageInput, is_valid_image, make_flat_list_of_images
from ...processing_utils import (
ProcessingKwargs,
Unpack,
@@ -168,7 +167,7 @@ class ColPaliProcessor(PaliGemmaProcessor):
)
for prompt, image_list in zip(texts_doc, images)
]
images = make_batched_images(images)
images = make_flat_list_of_images(images)
pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
# max_length has to account for the image tokens

View File

@@ -23,7 +23,7 @@
from typing import ClassVar, List, Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, is_valid_image
from ...image_utils import ImageInput, is_valid_image, make_flat_list_of_images
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput
from ...utils import is_torch_available
@@ -72,29 +72,6 @@ def build_string_from_input(prompt, bos_token, image_seq_len, image_token, num_i
return f"{image_token * image_seq_len * num_images}{bos_token}{prompt}\n"
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If `images` is empty or is not a single image, a list of images or a nested list of images.
    """
    if isinstance(images, (list, tuple)):
        # Guard the `[0]` lookups so that empty input reaches the ValueError below instead of an IndexError.
        if images:
            first = images[0]
            if isinstance(first, (list, tuple)):
                # Nested list: only the very first image is inspected before flattening
                if first and is_valid_image(first[0]):
                    return [img for img_list in images for img in img_list]
            elif is_valid_image(first):
                return images
    elif is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
class ColPaliProcessor(ProcessorMixin):
r"""
Constructs a ColPali processor which wraps a PaliGemmaProcessor and special methods to process images and queries, as
@@ -230,7 +207,7 @@ class ColPaliProcessor(ProcessorMixin):
)
for prompt, image_list in zip(texts_doc, images)
]
images = make_batched_images(images)
images = make_flat_list_of_images(images)
pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
# max_length has to account for the image tokens

View File

@@ -29,7 +29,7 @@ from ...image_utils import (
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
make_nested_list_of_images,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
@@ -77,39 +77,6 @@ def get_resize_output_image_size(image, size, input_data_format) -> Tuple[int, i
return height, width
def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]:
    """
    Normalize the input into a list of batches, where each batch is a list of images.

    Args:
        images (`ImageInput`):
            A single image, a list of images, or a list of batches of images.

    Returns:
        A list of lists of images.

    Raises:
        ValueError: If the input is none of the accepted layouts.
    """
    # A lone image becomes one batch holding one image
    if is_valid_image(images):
        return [[images]]

    if isinstance(images, (list, tuple)) and len(images) > 0:
        head = images[0]
        # A flat list of images is treated as a single batch
        if is_valid_image(head):
            return [images]
        # A list of batches is returned untouched; only the first image of the first batch is inspected
        if isinstance(head, (list, tuple)) and len(head) > 0 and is_valid_image(head[0]):
            return images

    raise ValueError("Invalid input type. Must be a single image, a list of images, or a list of batches of images.")
# Copied from transformers.models.detr.image_processing_detr.max_across_indices
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
@@ -504,7 +471,7 @@ class Idefics2ImageProcessor(BaseImageProcessor):
do_pad = do_pad if do_pad is not None else self.do_pad
do_image_splitting = do_image_splitting if do_image_splitting is not None else self.do_image_splitting
images_list = make_list_of_images(images)
images_list = make_nested_list_of_images(images)
if not valid_images(images_list[0]):
raise ValueError(

View File

@@ -29,7 +29,7 @@ from ...image_utils import (
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
make_nested_list_of_images,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
@@ -141,40 +141,6 @@ def get_resize_output_image_size(
return height, width
# Copied from transformers.models.idefics2.image_processing_idefics2.make_list_of_images
def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]:
    """
    Normalize the input into a list of batches, where each batch is a list of images.

    Args:
        images (`ImageInput`):
            A single image, a list of images, or a list of batches of images.

    Returns:
        A list of lists of images.

    Raises:
        ValueError: If the input is none of the accepted layouts.
    """
    # A lone image becomes one batch holding one image
    if is_valid_image(images):
        return [[images]]

    if isinstance(images, (list, tuple)) and len(images) > 0:
        head = images[0]
        # A flat list of images is treated as a single batch
        if is_valid_image(head):
            return [images]
        # A list of batches is returned untouched; only the first image of the first batch is inspected
        if isinstance(head, (list, tuple)) and len(head) > 0 and is_valid_image(head[0]):
            return images

    raise ValueError("Invalid input type. Must be a single image, a list of images, or a list of batches of images.")
# Copied from transformers.models.detr.image_processing_detr.max_across_indices
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
@@ -720,7 +686,7 @@ class Idefics3ImageProcessor(BaseImageProcessor):
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
do_pad = do_pad if do_pad is not None else self.do_pad
images_list = make_list_of_images(images)
images_list = make_nested_list_of_images(images)
if not valid_images(images_list[0]):
raise ValueError(

View File

@@ -32,40 +32,17 @@ from ...image_utils import (
VideoInput,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
make_batched_videos,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
)
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
if is_vision_available():
import PIL
from ...utils import TensorType, filter_out_non_signature_kwargs, logging
logger = logging.get_logger(__name__)
def make_batched_videos(videos) -> List[VideoInput]:
    """
    Ensure that the output is a list of videos, each video being a list of frames.

    Args:
        videos (`VideoInput`):
            Video or videos to turn into a list of videos.

    Returns:
        list: A list of videos.

    Raises:
        ValueError: If `videos` is empty or matches none of the supported layouts.
    """
    if isinstance(videos, (list, tuple)):
        # Guard the `[0]` lookups so that empty input reaches the ValueError below instead of an IndexError.
        if videos:
            first = videos[0]
            if isinstance(first, (list, tuple)):
                # Already a list of videos given as lists of frames
                if first and is_valid_image(first[0]):
                    return videos
            elif is_valid_image(first):
                # A flat list of PIL frames is a single video
                if isinstance(first, PIL.Image.Image):
                    return [videos]
                # A list of 4d arrays: each array is split into its frames
                if len(first.shape) == 4:
                    return [list(video) for video in videos]
    elif is_valid_image(videos):
        # A single PIL image becomes a one-frame video
        if isinstance(videos, PIL.Image.Image):
            return [[videos]]
        # A single 4d array is one video, split into its frames
        if len(videos.shape) == 4:
            return [list(videos)]

    raise ValueError(f"Could not make batched video from {videos}")
# Copied from transformers.models.blip.image_processing_blip.BlipImageProcessor with Blip->InstructBlipVideo, BLIP->InstructBLIPVideo
class InstructBlipVideoImageProcessor(BaseImageProcessor):
r"""
@@ -198,7 +175,7 @@ class InstructBlipVideoImageProcessor(BaseImageProcessor):
do_convert_rgb: bool = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> PIL.Image.Image:
) -> BatchFeature:
"""
Preprocess a video or batch of images/videos.

View File

@@ -37,7 +37,7 @@ from ...image_utils import (
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
make_flat_list_of_images,
make_list_of_images,
to_numpy_array,
valid_images,
@@ -53,29 +53,6 @@ if is_vision_available():
from PIL import Image
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If `images` is empty or is not a single image, a list of images or a nested list of images.
    """
    if isinstance(images, (list, tuple)):
        # Guard the `[0]` lookups so that empty input reaches the ValueError below instead of an IndexError.
        if images:
            first = images[0]
            if isinstance(first, (list, tuple)):
                # Nested list: only the very first image is inspected before flattening
                if first and is_valid_image(first[0]):
                    return [img for img_list in images for img in img_list]
            elif is_valid_image(first):
                return images
    elif is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
"""
Divides an image into patches of a specified size.
@@ -670,7 +647,7 @@ class LlavaNextImageProcessor(BaseImageProcessor):
do_pad = do_pad if do_pad is not None else self.do_pad
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
images = make_batched_images(images)
images = make_flat_list_of_images(images)
if not valid_images(images):
raise ValueError(

View File

@@ -34,37 +34,17 @@ from ...image_utils import (
VideoInput,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
make_batched_videos,
make_list_of_images,
to_numpy_array,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
if is_vision_available():
from PIL import Image
def make_batched_videos(videos) -> List[VideoInput]:
    """
    Ensure that the output is a list of videos, each video being a list of frames.

    Args:
        videos (`VideoInput`):
            Video or videos to turn into a list of videos.

    Returns:
        list: A list of videos.

    Raises:
        ValueError: If `videos` is empty or matches none of the supported layouts.
    """
    if isinstance(videos, (list, tuple)):
        # Guard the `[0]` lookups so that empty input reaches the ValueError below instead of an IndexError.
        if videos:
            first = videos[0]
            if isinstance(first, (list, tuple)):
                # Already a list of videos given as lists of frames
                if first and is_valid_image(first[0]):
                    return videos
            elif is_valid_image(first):
                # A flat list of PIL frames is a single video
                if isinstance(first, Image.Image):
                    return [videos]
                # A list of 4d arrays: each array is split into its frames
                if len(first.shape) == 4:
                    return [list(video) for video in videos]
    elif is_valid_image(videos):
        # PIL images have no `.shape`; wrap a single one as a one-frame video instead of
        # failing with an AttributeError on the shape check below.
        if isinstance(videos, Image.Image):
            return [[videos]]
        # A single 4d array is one video, split into its frames
        if len(videos.shape) == 4:
            return [list(videos)]

    raise ValueError(f"Could not make batched video from {videos}")
class LlavaNextVideoImageProcessor(BaseImageProcessor):
r"""
Constructs a LLaVa-NeXT-Video video processor. Based on [`CLIPImageProcessor`] with incorporation of processing each video frame.
@@ -212,7 +192,7 @@ class LlavaNextVideoImageProcessor(BaseImageProcessor):
do_convert_rgb: bool = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Image.Image:
) -> list[np.ndarray]:
"""
Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

View File

@@ -36,7 +36,7 @@ from ...image_utils import (
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
make_flat_list_of_images,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
@@ -51,30 +51,6 @@ if is_vision_available():
from PIL import Image
# Copied from transformers.models.llava_next.image_processing_llava_next.make_batched_images
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If `images` is empty or is not a single image, a list of images or a nested list of images.
    """
    if isinstance(images, (list, tuple)):
        # Guard the `[0]` lookups so that empty input reaches the ValueError below instead of an IndexError.
        if images:
            first = images[0]
            if isinstance(first, (list, tuple)):
                # Nested list: only the very first image is inspected before flattening
                if first and is_valid_image(first[0]):
                    return [img for img_list in images for img in img_list]
            elif is_valid_image(first):
                return images
    elif is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
# Copied from transformers.models.llava_next.image_processing_llava_next.divide_to_patches
def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
"""
@@ -632,7 +608,7 @@ class LlavaOnevisionImageProcessor(BaseImageProcessor):
do_pad = do_pad if do_pad is not None else self.do_pad
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
images = make_batched_images(images)
images = make_flat_list_of_images(images)
if not valid_images(images):
raise ValueError(

View File

@@ -16,6 +16,8 @@
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
convert_to_rgb,
@@ -31,37 +33,17 @@ from ...image_utils import (
VideoInput,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
make_batched_videos,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
if is_vision_available():
from PIL import Image
def make_batched_videos(videos) -> List[VideoInput]:
    """
    Ensure that the output is a list of videos, each video being a list of frames.

    Args:
        videos (`VideoInput`):
            Video or videos to turn into a list of videos.

    Returns:
        list: A list of videos.

    Raises:
        ValueError: If `videos` is empty or matches none of the supported layouts.
    """
    if isinstance(videos, (list, tuple)):
        # Guard the `[0]` lookups so that empty input reaches the ValueError below instead of an IndexError.
        if videos:
            first = videos[0]
            if isinstance(first, (list, tuple)):
                # Already a list of videos given as lists of frames
                if first and is_valid_image(first[0]):
                    return videos
            elif is_valid_image(first):
                # A flat list of frames (PIL images or 3d arrays) is a single video
                if isinstance(first, Image.Image) or len(first.shape) == 3:
                    return [videos]
                # A list of 4d arrays: each array is split into its frames
                if len(first.shape) == 4:
                    return [list(video) for video in videos]
    elif is_valid_image(videos):
        # PIL images have no `.shape`; wrap a single one as a one-frame video instead of
        # failing with an AttributeError on the shape check below.
        if isinstance(videos, Image.Image):
            return [[videos]]
        # A single 4d array is one video, split into its frames
        if len(videos.shape) == 4:
            return [list(videos)]

    raise ValueError(f"Could not make batched video from {videos}")
class LlavaOnevisionVideoProcessor(BaseImageProcessor):
r"""
Constructs a LLaVa-Onevisino-Video video processor. Based on [`SiglipImageProcessor`] with incorporation of processing each video frame.
@@ -138,7 +120,7 @@ class LlavaOnevisionVideoProcessor(BaseImageProcessor):
do_convert_rgb: bool = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Image.Image:
) -> list[np.ndarray]:
"""
Args:
images (`ImageInput`):

View File

@@ -33,8 +33,8 @@ from ...image_utils import (
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_valid_image,
is_vision_available,
make_nested_list_of_images,
to_numpy_array,
validate_preprocess_arguments,
)
@@ -514,42 +514,6 @@ def convert_to_rgb(image: ImageInput) -> ImageInput:
return alpha_composite
# Modified from transformers.models.idefics2.image_processing_idefics2.make_list_of_images
def make_list_of_images(images: ImageInput) -> List[List[Optional[np.ndarray]]]:
    """
    Normalize the input into a list of batches, where each batch is a list of images.

    Args:
        images (`ImageInput`):
            A single image, a list of images, or a list of batches of images.

    Returns:
        A list of lists of images.

    Raises:
        ValueError: If the input is none of the accepted layouts.
    """
    # A lone image becomes one batch holding one image
    if is_valid_image(images):
        return [[images]]

    if isinstance(images, (list, tuple)):
        # A flat, non-empty list of images is treated as a single batch
        if is_valid_list_of_images(images):
            return [images]
        # A list of batches is returned untouched as long as at least one batch is a valid list of images
        every_entry_is_a_batch = all(isinstance(images_i, (list, tuple)) for images_i in images)
        if every_entry_is_a_batch and any(is_valid_list_of_images(images_i) for images_i in images):
            return images

    raise ValueError("Invalid input type. Must be a single image, a list of images, or a list of batches of images.")
def is_valid_list_of_images(images: List) -> bool:
    """
    Check that `images` is a non-empty collection in which every element is a valid image.

    Args:
        images (`List`):
            The collection to check.

    Returns:
        `bool`: `True` if `images` is non-empty and every element passes `is_valid_image`, `False` otherwise.
    """
    # `bool(...)` keeps the result a real boolean: a bare `images and ...` would hand back the empty
    # container itself when `images` is empty.
    return bool(images) and all(is_valid_image(image) for image in images)
def _validate_size(size: Dict[str, int]) -> None:
if not ("height" in size and "width" in size):
raise ValueError(f"Argument `size` must be a dictionary with keys 'height' and 'width'. Got: {size}")
@@ -726,7 +690,7 @@ class MllamaImageProcessor(BaseImageProcessor):
# extra validation
_validate_mllama_preprocess_arguments(do_resize, size, do_pad, max_image_tiles)
images_list = make_list_of_images(images)
images_list = make_nested_list_of_images(images)
if self.do_convert_rgb:
images_list = [[convert_to_rgb(image) for image in images] for images in images_list]

View File

@@ -20,16 +20,13 @@ from typing import List, Optional, Union
import numpy as np
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...image_utils import ImageInput, make_nested_list_of_images
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import (
PreTokenizedInput,
TextInput,
)
# TODO: Can we do it that way or its better include as "Copied from ..."
from .image_processing_mllama import make_list_of_images
class MllamaImagesKwargs(ImagesKwargs, total=False):
max_image_tiles: Optional[int]
@@ -292,7 +289,7 @@ class MllamaProcessor(ProcessorMixin):
n_images_in_images = [0]
if images is not None:
images = make_list_of_images(images)
images = make_nested_list_of_images(images)
n_images_in_images = [len(sample) for sample in images]
if text is not None:

View File

@@ -19,7 +19,7 @@ Processor class for PaliGemma.
from typing import List, Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, is_valid_image
from ...image_utils import ImageInput, is_valid_image, make_flat_list_of_images
from ...processing_utils import (
ImagesKwargs,
ProcessingKwargs,
@@ -99,30 +99,6 @@ def build_string_from_input(prompt, bos_token, image_seq_len, image_token, num_i
return f"{image_token * image_seq_len * num_images}{bos_token}{prompt}\n"
# Copied from transformers.models.llava_next.image_processing_llava_next.make_batched_images
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If `images` is empty or is not a single image, a list of images or a nested list of images.
    """
    if isinstance(images, (list, tuple)):
        # Guard the `[0]` lookups so that empty input reaches the ValueError below instead of an IndexError.
        if images:
            first = images[0]
            if isinstance(first, (list, tuple)):
                # Nested list: only the very first image is inspected before flattening
                if first and is_valid_image(first[0]):
                    return [img for img_list in images for img in img_list]
            elif is_valid_image(first):
                return images
    elif is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
class PaliGemmaProcessor(ProcessorMixin):
r"""
Constructs a PaliGemma processor which wraps a PaliGemma image processor and a PaliGemma tokenizer into a single processor.
@@ -297,7 +273,7 @@ class PaliGemmaProcessor(ProcessorMixin):
)
for prompt, image_list in zip(text, images)
]
images = make_batched_images(images)
images = make_flat_list_of_images(images)
else:
expanded_samples = []
for sample in text:

View File

@@ -41,61 +41,19 @@ from ...image_utils import (
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
make_batched_videos,
make_flat_list_of_images,
make_list_of_images,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
if is_vision_available():
from PIL import Image
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If `images` is empty or is not a single image, a list of images or a nested list of images.
    """
    if isinstance(images, (list, tuple)):
        # Guard the `[0]` lookups so that empty input reaches the ValueError below instead of an IndexError.
        if images:
            first = images[0]
            if isinstance(first, (list, tuple)):
                # Nested list: only the very first image is inspected before flattening
                if first and is_valid_image(first[0]):
                    return [img for img_list in images for img in img_list]
            elif is_valid_image(first):
                return images
    elif is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
def make_batched_videos(videos) -> List[VideoInput]:
    """
    Ensure that the output is a list of videos, each video being a list of frames.

    Args:
        videos (`VideoInput`):
            Video or videos to turn into a list of videos.

    Returns:
        list: A list of videos.

    Raises:
        ValueError: If `videos` is empty or matches none of the supported layouts.
    """
    if isinstance(videos, (list, tuple)):
        # Guard the `[0]` lookups so that empty input reaches the ValueError below instead of an IndexError.
        if videos:
            first = videos[0]
            if isinstance(first, (list, tuple)):
                # Already a list of videos given as lists of frames
                if first and is_valid_image(first[0]):
                    return videos
            elif is_valid_image(first):
                # A flat list of PIL frames is a single video
                if isinstance(first, Image.Image):
                    return [videos]
                # A list of 4d arrays: each array is split into its frames
                if len(first.shape) == 4:
                    return [list(video) for video in videos]
    elif is_valid_image(videos):
        # PIL images have no `.shape`; wrap a single one as a one-frame video instead of
        # failing with an AttributeError on the shape check below.
        if isinstance(videos, Image.Image):
            return [[videos]]
        # A single 4d array is one video, split into its frames
        if len(videos.shape) == 4:
            return [list(videos)]

    raise ValueError(f"Could not make batched video from {videos}")
def smart_resize(
height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
):
@@ -398,7 +356,7 @@ class Qwen2_5_VLImageProcessor(BaseImageProcessor):
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
if images is not None:
images = make_batched_images(images)
images = make_flat_list_of_images(images)
if videos is not None:
videos = make_batched_videos(videos)

View File

@@ -40,62 +40,19 @@ from ...image_utils import (
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
make_batched_videos,
make_flat_list_of_images,
make_list_of_images,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
if is_vision_available():
from PIL import Image
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If `images` is empty or is not a single image, a list of images or a nested list of images.
    """
    if isinstance(images, (list, tuple)):
        # Guard the `[0]` lookups so that empty input reaches the ValueError below instead of an IndexError.
        if images:
            first = images[0]
            if isinstance(first, (list, tuple)):
                # Nested list: only the very first image is inspected before flattening
                if first and is_valid_image(first[0]):
                    return [img for img_list in images for img in img_list]
            elif is_valid_image(first):
                return images
    elif is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
def make_batched_videos(videos) -> List[VideoInput]:
    """
    Normalize a video input into a list of videos.

    Args:
        videos: A list of videos given as lists of frames, a list of PIL frames, a list of arrays/tensors whose
            `shape` has length 4, or a single array/tensor whose `shape` has length 4.

    Returns:
        The batched videos; see the inline comments for what each accepted layout maps to.

    Raises:
        ValueError: If `videos` matches none of the handled layouts.
    """
    # Already a list of frame lists: returned unchanged. Only the first frame of the first video is validated.
    if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
        return videos
    # Flat list whose first element is itself a valid image.
    elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
        # A list of PIL frames is treated as a single video.
        if isinstance(videos[0], Image.Image):
            return [videos]
        # Each element with a length-4 shape is split along its first axis into a list.
        elif len(videos[0].shape) == 4:
            return [list(video) for video in videos]
        # NOTE(review): a non-PIL first element whose shape is not of length 4 (e.g. a list of 3-D frame
        # arrays) matches neither test above and falls through to the ValueError below - confirm intended.
    # A single array/tensor with a length-4 shape becomes one video.
    elif is_valid_image(videos) and len(videos.shape) == 4:
        return [list(videos)]
    raise ValueError(f"Could not make batched video from {videos}")
def smart_resize(
height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
):
@@ -392,7 +349,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
if images is not None:
images = make_batched_images(images)
images = make_flat_list_of_images(images)
if videos is not None:
videos = make_batched_videos(videos)

View File

@@ -39,6 +39,8 @@ from ...image_utils import (
get_image_size,
get_image_type,
infer_channel_dimension_format,
make_batched_videos,
make_flat_list_of_images,
make_list_of_images,
valid_images,
validate_preprocess_arguments,
@@ -51,7 +53,7 @@ from ...utils import (
is_vision_available,
logging,
)
from .image_processing_qwen2_vl import make_batched_images, make_batched_videos, smart_resize
from .image_processing_qwen2_vl import smart_resize
if is_torch_available():
@@ -350,7 +352,7 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
image_std = tuple(image_std) if isinstance(image_std, list) else image_std
if images is not None:
images = make_batched_images(images)
images = make_flat_list_of_images(images)
if videos is not None:
videos = make_batched_videos(videos)

View File

@@ -30,7 +30,7 @@ from ...image_utils import (
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
make_flat_list_of_images,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
@@ -181,7 +181,7 @@ class SiglipImageProcessor(BaseImageProcessor):
image_std = image_std if image_std is not None else self.image_std
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
images = make_list_of_images(images)
images = make_flat_list_of_images(images)
if not valid_images(images):
raise ValueError(

View File

@@ -34,38 +34,18 @@ from ...image_utils import (
VideoInput,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
make_batched_videos,
make_list_of_images,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
)
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
from ...utils import TensorType, filter_out_non_signature_kwargs, logging
logger = logging.get_logger(__name__)
if is_vision_available():
import PIL
def make_batched_videos(videos) -> List[VideoInput]:
    """
    Normalize a video input into a list of videos.

    Accepted layouts are: a list of videos already given as lists of frames (returned unchanged), a list of PIL
    frames (wrapped as one video), a list of arrays/tensors whose `shape` has length 4 (each one split along its
    first axis) and a single array/tensor whose `shape` has length 4 (split into one video).

    Raises:
        ValueError: If `videos` matches none of the layouts above.
    """
    if isinstance(videos, (list, tuple)):
        first = videos[0]
        # Only the first frame of the first video is validated for the nested layout.
        if isinstance(first, (list, tuple)) and is_valid_image(first[0]):
            return videos
        if is_valid_image(first):
            if isinstance(first, PIL.Image.Image):
                return [videos]
            if len(first.shape) == 4:
                return [list(clip) for clip in videos]
            # Any other first element falls through to the error below, as before.
    elif is_valid_image(videos) and len(videos.shape) == 4:
        return [list(videos)]

    raise ValueError(f"Could not make batched video from {videos}")
class VideoLlavaImageProcessor(BaseImageProcessor):
r"""
Constructs a CLIP image processor.
@@ -208,7 +188,7 @@ class VideoLlavaImageProcessor(BaseImageProcessor):
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> PIL.Image.Image:
) -> BatchFeature:
"""
Preprocess an image or batch of images.

View File

@@ -26,7 +26,6 @@ from tests.test_configuration_common import ConfigTester
from tests.test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from transformers import (
is_torch_available,
is_vision_available,
)
from transformers.models.colpali.configuration_colpali import ColPaliConfig
from transformers.models.colpali.modeling_colpali import ColPaliForRetrieval, ColPaliForRetrievalOutput
@@ -43,9 +42,6 @@ from transformers.testing_utils import (
if is_torch_available():
import torch
if is_vision_available():
pass
class ColPaliForRetrievalModelTester:
def __init__(

View File

@@ -39,7 +39,7 @@ from transformers.testing_utils import (
slow,
torch_device,
)
from transformers.utils import is_torch_available, is_vision_available
from transformers.utils import is_torch_available
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
@@ -58,10 +58,6 @@ if is_torch_available():
from transformers import InstructBlipVideoForConditionalGeneration, InstructBlipVideoVisionModel
if is_vision_available():
pass
class InstructBlipVideoVisionModelTester:
def __init__(
self,

View File

@@ -20,7 +20,6 @@ from transformers import (
PixtralVisionConfig,
PixtralVisionModel,
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import (
require_torch,
@@ -35,10 +34,6 @@ if is_torch_available():
import torch
if is_vision_available():
pass
class PixtralVisionModelTester:
def __init__(
self,

View File

@@ -28,7 +28,14 @@ from requests import ConnectTimeout, ReadTimeout
from tests.pipelines.test_pipelines_document_question_answering import INVOICE_URL
from transformers import is_torch_available, is_vision_available
from transformers.image_utils import ChannelDimension, get_channel_dimension_axis, make_list_of_images
from transformers.image_utils import (
ChannelDimension,
get_channel_dimension_axis,
make_batched_videos,
make_flat_list_of_images,
make_list_of_images,
make_nested_list_of_images,
)
from transformers.testing_utils import is_flaky, require_torch, require_vision
@@ -115,6 +122,21 @@ class ImageFeatureExtractionTester(unittest.TestCase):
self.assertEqual(array5.shape, (3, 16, 32))
self.assertTrue(np.array_equal(array5, array1))
def test_make_list_of_images_pil(self):
    # A lone PIL image must come back wrapped in a one-element list.
    single = get_random_image(16, 32)
    wrapped = make_list_of_images(single)
    self.assertIsInstance(wrapped, list)
    self.assertEqual(len(wrapped), 1)
    self.assertIsInstance(wrapped[0], PIL.Image.Image)

    # A list of PIL images keeps its length and element type.
    batch = [get_random_image(16, 32) for _ in range(4)]
    listed = make_list_of_images(batch)
    self.assertIsInstance(listed, list)
    self.assertEqual(len(listed), 4)
    self.assertIsInstance(listed[0], PIL.Image.Image)
def test_make_list_of_images_numpy(self):
# Test a single image is converted to a list of 1 image
images = np.random.randint(0, 256, (16, 32, 3))
@@ -167,6 +189,323 @@ class ImageFeatureExtractionTester(unittest.TestCase):
self.assertTrue(np.array_equal(images_list[0], images[0]))
self.assertIsInstance(images_list, list)
def test_make_flat_list_of_images_pil(self):
    # A lone PIL image must come back wrapped in a one-element list.
    single = get_random_image(16, 32)
    flat = make_flat_list_of_images(single)
    self.assertIsInstance(flat, list)
    self.assertEqual(len(flat), 1)
    self.assertIsInstance(flat[0], PIL.Image.Image)

    # A flat list of PIL images keeps its length and element type.
    batch = [get_random_image(16, 32) for _ in range(4)]
    flat = make_flat_list_of_images(batch)
    self.assertIsInstance(flat, list)
    self.assertEqual(len(flat), 4)
    self.assertIsInstance(flat[0], PIL.Image.Image)

    # A 2x2 nested list is flattened into 4 images.
    nested = [[get_random_image(16, 32) for _ in range(2)] for _ in range(2)]
    flat = make_flat_list_of_images(nested)
    self.assertIsInstance(flat, list)
    self.assertEqual(len(flat), 4)
    self.assertIsInstance(flat[0], PIL.Image.Image)
def test_make_flat_list_of_images_numpy(self):
    image_shape = (16, 32, 3)
    batch_shape = (4, 16, 32, 3)

    # A lone 3d array is wrapped in a one-element list.
    single = np.random.randint(0, 256, image_shape)
    flat = make_flat_list_of_images(single)
    self.assertEqual(len(flat), 1)
    self.assertTrue(np.array_equal(flat[0], single))
    self.assertIsInstance(flat, list)

    # A 4d array is split into a list of its 4 images.
    batch = np.random.randint(0, 256, batch_shape)
    flat = make_flat_list_of_images(batch)
    self.assertEqual(len(flat), 4)
    self.assertIsInstance(flat, list)
    self.assertIsInstance(flat[0], np.ndarray)
    self.assertTrue(np.array_equal(flat[0], batch[0]))

    # A flat list of 3d arrays keeps its length and content.
    image_list = [np.random.randint(0, 256, image_shape) for _ in range(4)]
    flat = make_flat_list_of_images(image_list)
    self.assertEqual(len(flat), 4)
    self.assertTrue(np.array_equal(flat[0], image_list[0]))
    self.assertIsInstance(flat, list)

    # A list of two 4d arrays is flattened into 2 * 4 = 8 images.
    batch_list = [np.random.randint(0, 256, batch_shape) for _ in range(2)]
    flat = make_flat_list_of_images(batch_list)
    self.assertEqual(len(flat), 8)
    self.assertTrue(np.array_equal(flat[0], batch_list[0][0]))
    self.assertIsInstance(flat, list)
    self.assertIsInstance(flat[0], np.ndarray)

    # A 2x2 nested list of 3d arrays is flattened into 4 images.
    nested = [[np.random.randint(0, 256, image_shape) for _ in range(2)] for _ in range(2)]
    flat = make_flat_list_of_images(nested)
    self.assertEqual(len(flat), 4)
    self.assertTrue(np.array_equal(flat[0], nested[0][0]))
    self.assertIsInstance(flat, list)
@require_torch
def test_make_flat_list_of_images_torch(self):
    image_shape = (16, 32, 3)
    batch_shape = (4, 16, 32, 3)

    # A lone 3d tensor is wrapped in a one-element list.
    single = torch.randint(0, 256, image_shape)
    flat = make_flat_list_of_images(single)
    self.assertEqual(len(flat), 1)
    self.assertTrue(np.array_equal(flat[0], single))
    self.assertIsInstance(flat, list)

    # A 4d tensor is split into a list of its 4 images.
    batch = torch.randint(0, 256, batch_shape)
    flat = make_flat_list_of_images(batch)
    self.assertEqual(len(flat), 4)
    self.assertIsInstance(flat, list)
    self.assertIsInstance(flat[0], torch.Tensor)
    self.assertTrue(np.array_equal(flat[0], batch[0]))

    # A flat list of 3d tensors keeps its length and content.
    image_list = [torch.randint(0, 256, image_shape) for _ in range(4)]
    flat = make_flat_list_of_images(image_list)
    self.assertEqual(len(flat), 4)
    self.assertTrue(np.array_equal(flat[0], image_list[0]))
    self.assertIsInstance(flat, list)

    # A list of two 4d tensors is flattened into 2 * 4 = 8 images.
    batch_list = [torch.randint(0, 256, batch_shape) for _ in range(2)]
    flat = make_flat_list_of_images(batch_list)
    self.assertEqual(len(flat), 8)
    self.assertTrue(np.array_equal(flat[0], batch_list[0][0]))
    self.assertIsInstance(flat, list)
    self.assertIsInstance(flat[0], torch.Tensor)

    # A 2x2 nested list of 3d tensors is flattened into 4 images.
    nested = [[torch.randint(0, 256, image_shape) for _ in range(2)] for _ in range(2)]
    flat = make_flat_list_of_images(nested)
    self.assertEqual(len(flat), 4)
    self.assertTrue(np.array_equal(flat[0], nested[0][0]))
    self.assertIsInstance(flat, list)
def test_make_nested_list_of_images_pil(self):
    # A lone PIL image becomes a nested list holding that one image.
    single = get_random_image(16, 32)
    nested_out = make_nested_list_of_images(single)
    self.assertIsInstance(nested_out[0], list)
    self.assertEqual(len(nested_out[0]), 1)
    self.assertIsInstance(nested_out[0][0], PIL.Image.Image)

    # A flat list of 4 PIL images becomes one inner list of 4 images.
    batch = [get_random_image(16, 32) for _ in range(4)]
    nested_out = make_nested_list_of_images(batch)
    self.assertIsInstance(nested_out[0], list)
    self.assertEqual(len(nested_out), 1)
    self.assertEqual(len(nested_out[0]), 4)
    self.assertIsInstance(nested_out[0][0], PIL.Image.Image)

    # A 2x2 nested list keeps its 2x2 layout.
    nested_in = [[get_random_image(16, 32) for _ in range(2)] for _ in range(2)]
    nested_out = make_nested_list_of_images(nested_in)
    self.assertIsInstance(nested_out[0], list)
    self.assertEqual(len(nested_out), 2)
    self.assertEqual(len(nested_out[0]), 2)
    self.assertIsInstance(nested_out[0][0], PIL.Image.Image)
def test_make_nested_list_of_images_numpy(self):
    image_shape = (16, 32, 3)
    batch_shape = (4, 16, 32, 3)

    # A lone 3d array becomes a nested list holding that one image.
    single = np.random.randint(0, 256, image_shape)
    nested_out = make_nested_list_of_images(single)
    self.assertIsInstance(nested_out[0], list)
    self.assertEqual(len(nested_out), 1)
    self.assertTrue(np.array_equal(nested_out[0][0], single))

    # A 4d array becomes one inner list of its 4 images.
    batch = np.random.randint(0, 256, batch_shape)
    nested_out = make_nested_list_of_images(batch)
    self.assertIsInstance(nested_out[0], list)
    self.assertIsInstance(nested_out[0][0], np.ndarray)
    self.assertEqual(len(nested_out), 1)
    self.assertEqual(len(nested_out[0]), 4)
    self.assertTrue(np.array_equal(nested_out[0][0], batch[0]))

    # A flat list of 4 3d arrays becomes one inner list of 4 images.
    image_list = [np.random.randint(0, 256, image_shape) for _ in range(4)]
    nested_out = make_nested_list_of_images(image_list)
    self.assertIsInstance(nested_out[0], list)
    self.assertEqual(len(nested_out), 1)
    self.assertEqual(len(nested_out[0]), 4)
    self.assertTrue(np.array_equal(nested_out[0][0], image_list[0]))

    # A 2x2 nested list of 3d arrays keeps its 2x2 layout.
    nested_in = [[np.random.randint(0, 256, image_shape) for _ in range(2)] for _ in range(2)]
    nested_out = make_nested_list_of_images(nested_in)
    self.assertIsInstance(nested_out[0], list)
    self.assertEqual(len(nested_out), 2)
    self.assertEqual(len(nested_out[0]), 2)
    self.assertTrue(np.array_equal(nested_out[0][0], nested_in[0][0]))

    # A list of two 4d arrays becomes 2 inner lists of 4 images each.
    batch_list = [np.random.randint(0, 256, batch_shape) for _ in range(2)]
    nested_out = make_nested_list_of_images(batch_list)
    self.assertIsInstance(nested_out[0], list)
    self.assertIsInstance(nested_out[0][0], np.ndarray)
    self.assertEqual(len(nested_out), 2)
    self.assertEqual(len(nested_out[0]), 4)
    self.assertTrue(np.array_equal(nested_out[0][0], batch_list[0][0]))
@require_torch
def test_make_nested_list_of_images_torch(self):
    image_shape = (16, 32, 3)
    batch_shape = (4, 16, 32, 3)

    # A lone 3d tensor becomes a nested list holding that one image.
    single = torch.randint(0, 256, image_shape)
    nested_out = make_nested_list_of_images(single)
    self.assertIsInstance(nested_out[0], list)
    self.assertEqual(len(nested_out[0]), 1)
    self.assertTrue(np.array_equal(nested_out[0][0], single))

    # A 4d tensor becomes one inner list of its 4 images.
    batch = torch.randint(0, 256, batch_shape)
    nested_out = make_nested_list_of_images(batch)
    self.assertIsInstance(nested_out[0], list)
    self.assertIsInstance(nested_out[0][0], torch.Tensor)
    self.assertEqual(len(nested_out), 1)
    self.assertEqual(len(nested_out[0]), 4)
    self.assertTrue(np.array_equal(nested_out[0][0], batch[0]))

    # A flat list of 4 3d tensors becomes one inner list of 4 images.
    image_list = [torch.randint(0, 256, image_shape) for _ in range(4)]
    nested_out = make_nested_list_of_images(image_list)
    self.assertIsInstance(nested_out[0], list)
    self.assertEqual(len(nested_out), 1)
    self.assertEqual(len(nested_out[0]), 4)
    self.assertTrue(np.array_equal(nested_out[0][0], image_list[0]))

    # A 2x2 nested list of 3d tensors keeps its 2x2 layout.
    nested_in = [[torch.randint(0, 256, image_shape) for _ in range(2)] for _ in range(2)]
    nested_out = make_nested_list_of_images(nested_in)
    self.assertIsInstance(nested_out[0], list)
    self.assertEqual(len(nested_out), 2)
    self.assertEqual(len(nested_out[0]), 2)
    self.assertTrue(np.array_equal(nested_out[0][0], nested_in[0][0]))

    # A list of two 4d tensors becomes 2 inner lists of 4 images each.
    batch_list = [torch.randint(0, 256, batch_shape) for _ in range(2)]
    nested_out = make_nested_list_of_images(batch_list)
    self.assertIsInstance(nested_out[0], list)
    self.assertIsInstance(nested_out[0][0], torch.Tensor)
    self.assertEqual(len(nested_out), 2)
    self.assertEqual(len(nested_out[0]), 4)
    self.assertTrue(np.array_equal(nested_out[0][0], batch_list[0][0]))
def test_make_batched_videos_pil(self):
    # Every case below exercises `make_batched_videos`. The nested-list case previously called
    # `make_nested_list_of_images`, so that input layout was never checked for the function under test.

    # Test a single image is converted to a list of 1 video with 1 frame
    pil_image = get_random_image(16, 32)
    videos_list = make_batched_videos(pil_image)
    self.assertIsInstance(videos_list[0], list)
    self.assertEqual(len(videos_list[0]), 1)
    self.assertIsInstance(videos_list[0][0], PIL.Image.Image)

    # Test a list of images is converted to a list of 1 video
    images = [get_random_image(16, 32) for _ in range(4)]
    videos_list = make_batched_videos(images)
    self.assertIsInstance(videos_list[0], list)
    self.assertEqual(len(videos_list), 1)
    self.assertEqual(len(videos_list[0]), 4)
    self.assertIsInstance(videos_list[0][0], PIL.Image.Image)

    # Test a nested list of images (a list of videos) is not modified
    images = [[get_random_image(16, 32) for _ in range(2)] for _ in range(2)]
    videos_list = make_batched_videos(images)
    self.assertIsInstance(videos_list[0], list)
    self.assertEqual(len(videos_list), 2)
    self.assertEqual(len(videos_list[0]), 2)
    self.assertIsInstance(videos_list[0][0], PIL.Image.Image)
def test_make_batched_videos_numpy(self):
    # Every case below exercises `make_batched_videos`. The test previously called
    # `make_nested_list_of_images` throughout, so the function named in the test was never run on numpy
    # inputs. The assertions themselves are unchanged.

    # Test a single image is converted to a list of 1 video with 1 frame
    images = np.random.randint(0, 256, (16, 32, 3))
    videos_list = make_batched_videos(images)
    self.assertIsInstance(videos_list[0], list)
    self.assertEqual(len(videos_list), 1)
    self.assertTrue(np.array_equal(videos_list[0][0], images))

    # Test a 4d array of images is converted to a list of 1 video
    images = np.random.randint(0, 256, (4, 16, 32, 3))
    videos_list = make_batched_videos(images)
    self.assertIsInstance(videos_list[0], list)
    self.assertIsInstance(videos_list[0][0], np.ndarray)
    self.assertEqual(len(videos_list), 1)
    self.assertEqual(len(videos_list[0]), 4)
    self.assertTrue(np.array_equal(videos_list[0][0], images[0]))

    # Test a list of images is converted to a list of 1 video
    images = [np.random.randint(0, 256, (16, 32, 3)) for _ in range(4)]
    videos_list = make_batched_videos(images)
    self.assertIsInstance(videos_list[0], list)
    self.assertEqual(len(videos_list), 1)
    self.assertEqual(len(videos_list[0]), 4)
    self.assertTrue(np.array_equal(videos_list[0][0], images[0]))

    # Test a nested list of images (a list of videos) is left unchanged
    images = [[np.random.randint(0, 256, (16, 32, 3)) for _ in range(2)] for _ in range(2)]
    videos_list = make_batched_videos(images)
    self.assertIsInstance(videos_list[0], list)
    self.assertEqual(len(videos_list), 2)
    self.assertEqual(len(videos_list[0]), 2)
    self.assertTrue(np.array_equal(videos_list[0][0], images[0][0]))

    # Test a list of 4d array images is converted to a list of videos
    images = [np.random.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)]
    videos_list = make_batched_videos(images)
    self.assertIsInstance(videos_list[0], list)
    self.assertIsInstance(videos_list[0][0], np.ndarray)
    self.assertEqual(len(videos_list), 2)
    self.assertEqual(len(videos_list[0]), 4)
    self.assertTrue(np.array_equal(videos_list[0][0], images[0][0]))
@require_torch
def test_make_batched_videos_torch(self):
    # Every case below exercises `make_batched_videos`. The test previously called
    # `make_nested_list_of_images` throughout, so the function named in the test was never run on torch
    # inputs. The assertions themselves are unchanged.

    # Test a single image is converted to a list of 1 video with 1 frame
    images = torch.randint(0, 256, (16, 32, 3))
    videos_list = make_batched_videos(images)
    self.assertIsInstance(videos_list[0], list)
    self.assertEqual(len(videos_list[0]), 1)
    self.assertTrue(np.array_equal(videos_list[0][0], images))

    # Test a 4d tensor of images is converted to a list of 1 video
    images = torch.randint(0, 256, (4, 16, 32, 3))
    videos_list = make_batched_videos(images)
    self.assertIsInstance(videos_list[0], list)
    self.assertIsInstance(videos_list[0][0], torch.Tensor)
    self.assertEqual(len(videos_list), 1)
    self.assertEqual(len(videos_list[0]), 4)
    self.assertTrue(np.array_equal(videos_list[0][0], images[0]))

    # Test a list of images is converted to a list of 1 video
    images = [torch.randint(0, 256, (16, 32, 3)) for _ in range(4)]
    videos_list = make_batched_videos(images)
    self.assertIsInstance(videos_list[0], list)
    self.assertEqual(len(videos_list), 1)
    self.assertEqual(len(videos_list[0]), 4)
    self.assertTrue(np.array_equal(videos_list[0][0], images[0]))

    # Test a nested list of images (a list of videos) is left unchanged
    images = [[torch.randint(0, 256, (16, 32, 3)) for _ in range(2)] for _ in range(2)]
    videos_list = make_batched_videos(images)
    self.assertIsInstance(videos_list[0], list)
    self.assertEqual(len(videos_list), 2)
    self.assertEqual(len(videos_list[0]), 2)
    self.assertTrue(np.array_equal(videos_list[0][0], images[0][0]))

    # Test a list of 4d tensor images is converted to a list of videos
    images = [torch.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)]
    videos_list = make_batched_videos(images)
    self.assertIsInstance(videos_list[0], list)
    self.assertIsInstance(videos_list[0][0], torch.Tensor)
    self.assertEqual(len(videos_list), 2)
    self.assertEqual(len(videos_list[0]), 4)
    self.assertTrue(np.array_equal(videos_list[0][0], images[0][0]))
@require_torch
def test_conversion_torch_to_array(self):
feature_extractor = ImageFeatureExtractionMixin()