Add LlavaImageProcessor (#33191)
* First draft * Add equivalence test * Update docstrings * Add tests * Use numpy * Fix tests * Improve variable names * Improve docstring * Add link * Remove script * Add copied from * Address comment * Add note in docs * Add docstring, data format * Improve test * Add test * update * Update src/transformers/models/llava/image_processing_llava.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * Update src/transformers/models/llava/image_processing_llava.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * loop once only --------- Co-authored-by: raushan <raushan@huggingface.co> Co-authored-by: Raushan Turganbay <raushan.turganbay@alumni.nu.edu.kz> Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
This commit is contained in:
@@ -162,6 +162,16 @@ For multiple turns conversation:
|
||||
"USER: <image>\n<prompt1> ASSISTANT: <answer1></s>USER: <prompt2> ASSISTANT: <answer2></s>USER: <prompt3> ASSISTANT:"
|
||||
```
|
||||
|
||||
## Note regarding reproducing original implementation
|
||||
|
||||
In order to match the logits of the [original implementation](https://github.com/haotian-liu/LLaVA/tree/main), one needs to additionally specify `do_pad=True` when instantiating `LLavaImageProcessor`:
|
||||
|
||||
```python
|
||||
from transformers import LLavaImageProcessor
|
||||
|
||||
image_processor = LLavaImageProcessor.from_pretrained("https://huggingface.co/llava-hf/llava-1.5-7b-hf", do_pad=True)
|
||||
```
|
||||
|
||||
### Using Flash Attention 2
|
||||
|
||||
Flash Attention 2 is an even faster, optimized version of the previous optimization, please refer to the [Flash Attention 2 section of performance docs](https://huggingface.co/docs/transformers/perf_infer_gpu_one).
|
||||
@@ -180,6 +190,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
|
||||
|
||||
[[autodoc]] LlavaConfig
|
||||
|
||||
## LlavaImageProcessor
|
||||
|
||||
[[autodoc]] LlavaImageProcessor
|
||||
- preprocess
|
||||
|
||||
## LlavaProcessor
|
||||
|
||||
[[autodoc]] LlavaProcessor
|
||||
|
||||
@@ -1243,6 +1243,7 @@ else:
|
||||
_import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"])
|
||||
_import_structure["models.layoutlmv3"].extend(["LayoutLMv3FeatureExtractor", "LayoutLMv3ImageProcessor"])
|
||||
_import_structure["models.levit"].extend(["LevitFeatureExtractor", "LevitImageProcessor"])
|
||||
_import_structure["models.llava"].append("LlavaImageProcessor")
|
||||
_import_structure["models.llava_next"].append("LlavaNextImageProcessor")
|
||||
_import_structure["models.llava_next_video"].append("LlavaNextVideoImageProcessor")
|
||||
_import_structure["models.llava_onevision"].extend(
|
||||
@@ -6334,6 +6335,7 @@ if TYPE_CHECKING:
|
||||
LayoutLMv3ImageProcessor,
|
||||
)
|
||||
from .models.levit import LevitFeatureExtractor, LevitImageProcessor
|
||||
from .models.llava import LlavaImageProcessor
|
||||
from .models.llava_next import LlavaNextImageProcessor
|
||||
from .models.llava_next_video import LlavaNextVideoImageProcessor
|
||||
from .models.llava_onevision import LlavaOnevisionImageProcessor, LlavaOnevisionVideoProcessor
|
||||
|
||||
@@ -101,7 +101,7 @@ else:
|
||||
("layoutlmv2", ("LayoutLMv2ImageProcessor",)),
|
||||
("layoutlmv3", ("LayoutLMv3ImageProcessor",)),
|
||||
("levit", ("LevitImageProcessor",)),
|
||||
("llava", ("CLIPImageProcessor",)),
|
||||
("llava", ("LlavaImageProcessor",)),
|
||||
("llava_next", ("LlavaNextImageProcessor",)),
|
||||
("llava_next_video", ("LlavaNextVideoImageProcessor",)),
|
||||
("llava_onevision", ("LlavaOnevisionImageProcessor",)),
|
||||
|
||||
436
src/transformers/models/llava/image_processing_llava.py
Normal file
436
src/transformers/models/llava/image_processing_llava.py
Normal file
@@ -0,0 +1,436 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Image processor class for LLaVa."""
|
||||
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
|
||||
from ...image_transforms import (
|
||||
convert_to_rgb,
|
||||
get_resize_output_image_size,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
)
|
||||
from ...image_utils import (
|
||||
OPENAI_CLIP_MEAN,
|
||||
OPENAI_CLIP_STD,
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
is_scaled_image,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
validate_kwargs,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...utils import TensorType, is_vision_available, logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
import PIL
|
||||
|
||||
|
||||
class LlavaImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a LLaVa image processor.
|
||||
|
||||
Args:
|
||||
do_pad (`bool`, *optional*, defaults to `False`):
|
||||
Whether to pad the image to a square based on the longest edge.
|
||||
The padding value is determined by the `image_mean` parameter.
|
||||
Can be overridden by `do_pad` in the `preprocess` method.
|
||||
do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
|
||||
`do_resize` in the `preprocess` method.
|
||||
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
|
||||
Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
|
||||
the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
|
||||
method.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
||||
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
|
||||
do_center_crop (`bool`, *optional*, defaults to `True`):
|
||||
Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
|
||||
`preprocess` method.
|
||||
crop_size (`Dict[str, int]` *optional*, defaults to 224):
|
||||
Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
|
||||
method.
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
|
||||
the `preprocess` method.
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
|
||||
method.
|
||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
|
||||
image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
|
||||
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
|
||||
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
|
||||
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
|
||||
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||
Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||
do_convert_rgb (`bool`, *optional*, defaults to `True`):
|
||||
Whether to convert the image to RGB.
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
do_pad: bool = False,
|
||||
do_resize: bool = True,
|
||||
size: Dict[str, int] = None,
|
||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
||||
do_center_crop: bool = True,
|
||||
crop_size: Dict[str, int] = None,
|
||||
do_rescale: bool = True,
|
||||
rescale_factor: Union[int, float] = 1 / 255,
|
||||
do_normalize: bool = True,
|
||||
image_mean: Optional[Union[float, List[float]]] = None,
|
||||
image_std: Optional[Union[float, List[float]]] = None,
|
||||
do_convert_rgb: bool = True,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__(**kwargs)
|
||||
size = size if size is not None else {"shortest_edge": 224}
|
||||
size = get_size_dict(size, default_to_square=False)
|
||||
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
|
||||
crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
|
||||
|
||||
self.do_pad = do_pad
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.resample = resample
|
||||
self.do_center_crop = do_center_crop
|
||||
self.crop_size = crop_size
|
||||
self.do_rescale = do_rescale
|
||||
self.rescale_factor = rescale_factor
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
|
||||
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
self._valid_processor_keys = [
|
||||
"images",
|
||||
"do_pad",
|
||||
"do_resize",
|
||||
"size",
|
||||
"resample",
|
||||
"do_center_crop",
|
||||
"crop_size",
|
||||
"do_rescale",
|
||||
"rescale_factor",
|
||||
"do_normalize",
|
||||
"image_mean",
|
||||
"image_std",
|
||||
"do_convert_rgb",
|
||||
"return_tensors",
|
||||
"data_format",
|
||||
"input_data_format",
|
||||
]
|
||||
|
||||
def pad_to_square(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
background_color: Union[int, Tuple[int, int, int]] = 0,
|
||||
data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
) -> np.array:
|
||||
"""
|
||||
Pads an image to a square based on the longest edge.
|
||||
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
The image to pad.
|
||||
background_color (`int` or `Tuple[int, int, int]`, *optional*, defaults to 0):
|
||||
The color to use for the padding. Can be an integer for single channel or a
|
||||
tuple of integers representing for multi-channel images. If passed as integer
|
||||
in mutli-channel mode, it will default to `0` in subsequent channels.
|
||||
data_format (`str` or `ChannelDimension`, *optional*):
|
||||
The channel dimension format for the output image. Can be one of:
|
||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
||||
If unset, will use same as the input image.
|
||||
input_data_format (`str` or `ChannelDimension`, *optional*):
|
||||
The channel dimension format for the input image. Can be one of:
|
||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
||||
If unset, will use the inferred format of the input image.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`: The padded image.
|
||||
"""
|
||||
height, width = get_image_size(image, input_data_format)
|
||||
num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1]
|
||||
|
||||
if height == width:
|
||||
image = (
|
||||
to_channel_dimension_format(image, data_format, input_data_format)
|
||||
if data_format is not None
|
||||
else image
|
||||
)
|
||||
return image
|
||||
|
||||
max_dim = max(height, width)
|
||||
|
||||
# Ensure background_color is the correct shape
|
||||
if isinstance(background_color, int):
|
||||
background_color = [background_color]
|
||||
elif len(background_color) != num_channels:
|
||||
raise ValueError(
|
||||
f"background_color must have no more than {num_channels} elements to match the number of channels"
|
||||
)
|
||||
|
||||
if input_data_format == ChannelDimension.FIRST:
|
||||
result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype)
|
||||
for i, color in enumerate(background_color):
|
||||
result[i, :, :] = color
|
||||
if width > height:
|
||||
start = (max_dim - height) // 2
|
||||
result[:, start : start + height, :] = image
|
||||
else:
|
||||
start = (max_dim - width) // 2
|
||||
result[:, :, start : start + width] = image
|
||||
else:
|
||||
result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype)
|
||||
for i, color in enumerate(background_color):
|
||||
result[:, :, i] = color
|
||||
if width > height:
|
||||
start = (max_dim - height) // 2
|
||||
result[start : start + height, :, :] = image
|
||||
else:
|
||||
start = (max_dim - width) // 2
|
||||
result[:, start : start + width, :] = image
|
||||
|
||||
image = (
|
||||
to_channel_dimension_format(result, data_format, input_data_format) if data_format is not None else result
|
||||
)
|
||||
return image
|
||||
|
||||
# Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
|
||||
def resize(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
size: Dict[str, int],
|
||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
||||
data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
|
||||
resized to keep the input aspect ratio.
|
||||
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to resize.
|
||||
size (`Dict[str, int]`):
|
||||
Size of the output image.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
|
||||
Resampling filter to use when resiizing the image.
|
||||
data_format (`str` or `ChannelDimension`, *optional*):
|
||||
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||
The channel dimension format of the input image. If not provided, it will be inferred.
|
||||
"""
|
||||
default_to_square = True
|
||||
if "shortest_edge" in size:
|
||||
size = size["shortest_edge"]
|
||||
default_to_square = False
|
||||
elif "height" in size and "width" in size:
|
||||
size = (size["height"], size["width"])
|
||||
else:
|
||||
raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
|
||||
|
||||
output_size = get_resize_output_image_size(
|
||||
image,
|
||||
size=size,
|
||||
default_to_square=default_to_square,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
return resize(
|
||||
image,
|
||||
size=output_size,
|
||||
resample=resample,
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
do_pad: bool = None,
|
||||
do_resize: Optional[bool] = None,
|
||||
size: Optional[Dict[str, int]] = None,
|
||||
resample: Optional[PILImageResampling] = None,
|
||||
do_center_crop: Optional[bool] = None,
|
||||
crop_size: Optional[int] = None,
|
||||
do_rescale: Optional[bool] = None,
|
||||
rescale_factor: Optional[float] = None,
|
||||
do_normalize: Optional[bool] = None,
|
||||
image_mean: Optional[Union[float, List[float]]] = None,
|
||||
image_std: Optional[Union[float, List[float]]] = None,
|
||||
do_convert_rgb: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
**kwargs,
|
||||
) -> PIL.Image.Image:
|
||||
"""
|
||||
Preprocess an image or batch of images.
|
||||
|
||||
Args:
|
||||
images (`ImageInput`):
|
||||
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
|
||||
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
|
||||
do_pad (`bool`, *optional*, defaults to `self.do_pad`):
|
||||
Whether to pad the image to a square based on the longest edge.
|
||||
The padding value is determined by the `image_mean` parameter.
|
||||
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
||||
Whether to resize the image.
|
||||
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
|
||||
Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
|
||||
the longest edge resized to keep the input aspect ratio.
|
||||
resample (`int`, *optional*, defaults to `self.resample`):
|
||||
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
|
||||
has an effect if `do_resize` is set to `True`.
|
||||
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
|
||||
Whether to center crop the image.
|
||||
crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
|
||||
Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
|
||||
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
||||
Whether to rescale the image.
|
||||
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
||||
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
|
||||
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
||||
Whether to normalize the image.
|
||||
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
|
||||
Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
|
||||
Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
|
||||
`True`.
|
||||
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
|
||||
Whether to convert the image to RGB.
|
||||
return_tensors (`str` or `TensorType`, *optional*):
|
||||
The type of tensors to return. Can be one of:
|
||||
- Unset: Return a list of `np.ndarray`.
|
||||
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
|
||||
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
|
||||
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
|
||||
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
|
||||
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
|
||||
The channel dimension format for the output image. Can be one of:
|
||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
||||
- Unset: Use the channel dimension format of the input image.
|
||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||
The channel dimension format for the input image. If unset, the channel dimension format is inferred
|
||||
from the input image. Can be one of:
|
||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
||||
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
||||
"""
|
||||
do_pad = do_pad if do_pad is not None else self.do_pad
|
||||
do_resize = do_resize if do_resize is not None else self.do_resize
|
||||
size = size if size is not None else self.size
|
||||
size = get_size_dict(size, param_name="size", default_to_square=False)
|
||||
resample = resample if resample is not None else self.resample
|
||||
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
|
||||
crop_size = crop_size if crop_size is not None else self.crop_size
|
||||
crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
|
||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
||||
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
||||
image_mean = image_mean if image_mean is not None else self.image_mean
|
||||
image_std = image_std if image_std is not None else self.image_std
|
||||
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
|
||||
|
||||
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
|
||||
|
||||
images = make_list_of_images(images)
|
||||
|
||||
if not valid_images(images):
|
||||
raise ValueError(
|
||||
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
|
||||
"torch.Tensor, tf.Tensor or jax.ndarray."
|
||||
)
|
||||
# we don't pass `do_pad` here since LLaVa uses a custom padding to a square
|
||||
validate_preprocess_arguments(
|
||||
do_rescale=do_rescale,
|
||||
rescale_factor=rescale_factor,
|
||||
do_normalize=do_normalize,
|
||||
image_mean=image_mean,
|
||||
image_std=image_std,
|
||||
do_center_crop=do_center_crop,
|
||||
crop_size=crop_size,
|
||||
do_resize=do_resize,
|
||||
size=size,
|
||||
resample=resample,
|
||||
)
|
||||
|
||||
if do_convert_rgb:
|
||||
images = [convert_to_rgb(image) for image in images]
|
||||
|
||||
# All transformations expect numpy arrays.
|
||||
images = [to_numpy_array(image) for image in images]
|
||||
|
||||
if is_scaled_image(images[0]) and do_rescale:
|
||||
logger.warning_once(
|
||||
"It looks like you are trying to rescale already rescaled images. If the input"
|
||||
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
|
||||
)
|
||||
|
||||
if input_data_format is None:
|
||||
# We assume that all images have the same channel dimension format.
|
||||
input_data_format = infer_channel_dimension_format(images[0])
|
||||
|
||||
processed_images = []
|
||||
for image in images:
|
||||
if do_pad:
|
||||
image = self.pad_to_square(
|
||||
image=image,
|
||||
background_color=tuple(int(x * 255) for x in self.image_mean),
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
|
||||
if do_resize:
|
||||
image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
|
||||
|
||||
if do_center_crop:
|
||||
image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format)
|
||||
|
||||
if do_rescale:
|
||||
images = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
|
||||
|
||||
if do_normalize:
|
||||
image = self.normalize(
|
||||
image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
|
||||
)
|
||||
|
||||
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||
processed_images.append(image)
|
||||
|
||||
return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
|
||||
|
||||
|
||||
__all__ = ["LlavaImageProcessor"]
|
||||
@@ -39,13 +39,13 @@ class LlavaProcessorKwargs(ProcessingKwargs, total=False):
|
||||
|
||||
class LlavaProcessor(ProcessorMixin):
|
||||
r"""
|
||||
Constructs a Llava processor which wraps a Llava image processor and a Llava tokenizer into a single processor.
|
||||
Constructs a LLaVa processor which wraps a LLaVa image processor and a LLaMa tokenizer into a single processor.
|
||||
|
||||
[`LlavaProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`LlamaTokenizerFast`]. See the
|
||||
[`LlavaProcessor`] offers all the functionalities of [`LlavaImageProcessor`] and [`LlamaTokenizerFast`]. See the
|
||||
[`~LlavaProcessor.__call__`] and [`~LlavaProcessor.decode`] for more information.
|
||||
|
||||
Args:
|
||||
image_processor ([`CLIPImageProcessor`], *optional*):
|
||||
image_processor ([`LlavaImageProcessor`], *optional*):
|
||||
The image processor is a required input.
|
||||
tokenizer ([`LlamaTokenizerFast`], *optional*):
|
||||
The tokenizer is a required input.
|
||||
|
||||
@@ -380,6 +380,13 @@ class LevitImageProcessor(metaclass=DummyObject):
|
||||
requires_backends(self, ["vision"])
|
||||
|
||||
|
||||
class LlavaImageProcessor(metaclass=DummyObject):
|
||||
_backends = ["vision"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["vision"])
|
||||
|
||||
|
||||
class LlavaNextImageProcessor(metaclass=DummyObject):
|
||||
_backends = ["vision"]
|
||||
|
||||
|
||||
203
tests/models/llava/test_image_processing_llava.py
Normal file
203
tests/models/llava/test_image_processing_llava.py
Normal file
@@ -0,0 +1,203 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import unittest
|
||||
from typing import Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import LlavaImageProcessor
|
||||
|
||||
|
||||
class LlavaImageProcessingTester(unittest.TestCase):
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=7,
|
||||
num_channels=3,
|
||||
image_size=18,
|
||||
min_resolution=30,
|
||||
max_resolution=400,
|
||||
do_pad=True,
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_center_crop=True,
|
||||
crop_size=None,
|
||||
do_normalize=True,
|
||||
image_mean=[0.48145466, 0.4578275, 0.40821073],
|
||||
image_std=[0.26862954, 0.26130258, 0.27577711],
|
||||
do_convert_rgb=True,
|
||||
):
|
||||
size = size if size is not None else {"shortest_edge": 20}
|
||||
crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.num_channels = num_channels
|
||||
self.image_size = image_size
|
||||
self.min_resolution = min_resolution
|
||||
self.max_resolution = max_resolution
|
||||
self.do_pad = do_pad
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.do_center_crop = do_center_crop
|
||||
self.crop_size = crop_size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
|
||||
def prepare_image_processor_dict(self):
|
||||
return {
|
||||
"do_pad": self.do_pad,
|
||||
"do_resize": self.do_resize,
|
||||
"size": self.size,
|
||||
"do_center_crop": self.do_center_crop,
|
||||
"crop_size": self.crop_size,
|
||||
"do_normalize": self.do_normalize,
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_convert_rgb": self.do_convert_rgb,
|
||||
}
|
||||
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape
|
||||
def expected_output_image_shape(self, images):
|
||||
return self.num_channels, self.crop_size["height"], self.crop_size["width"]
|
||||
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs
|
||||
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
|
||||
return prepare_image_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
numpify=numpify,
|
||||
torchify=torchify,
|
||||
)
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest with CLIP->Llava
|
||||
class LlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = LlavaImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = LlavaImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
def image_processor_dict(self):
|
||||
return self.image_processor_tester.prepare_image_processor_dict()
|
||||
|
||||
# Ignore copy
|
||||
def test_image_processor_properties(self):
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
self.assertTrue(hasattr(image_processing, "do_pad"))
|
||||
self.assertTrue(hasattr(image_processing, "do_resize"))
|
||||
self.assertTrue(hasattr(image_processing, "size"))
|
||||
self.assertTrue(hasattr(image_processing, "do_center_crop"))
|
||||
self.assertTrue(hasattr(image_processing, "center_crop"))
|
||||
self.assertTrue(hasattr(image_processing, "do_normalize"))
|
||||
self.assertTrue(hasattr(image_processing, "image_mean"))
|
||||
self.assertTrue(hasattr(image_processing, "image_std"))
|
||||
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
|
||||
|
||||
def test_image_processor_from_dict_with_kwargs(self):
|
||||
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
|
||||
self.assertEqual(image_processor.size, {"shortest_edge": 20})
|
||||
self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
|
||||
|
||||
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84)
|
||||
self.assertEqual(image_processor.size, {"shortest_edge": 42})
|
||||
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
|
||||
|
||||
# Ignore copy
|
||||
def test_padding(self):
|
||||
"""
|
||||
LLaVA needs to pad images to square size before processing as per orig implementation.
|
||||
Checks that image processor pads images correctly given different background colors.
|
||||
"""
|
||||
|
||||
# taken from original implementation: https://github.com/haotian-liu/LLaVA/blob/c121f0432da27facab705978f83c4ada465e46fd/llava/mm_utils.py#L152
|
||||
def pad_to_square_original(
|
||||
image: Image.Image, background_color: Union[int, Tuple[int, int, int]] = 0
|
||||
) -> Image.Image:
|
||||
width, height = image.size
|
||||
if width == height:
|
||||
return image
|
||||
elif width > height:
|
||||
result = Image.new(image.mode, (width, width), background_color)
|
||||
result.paste(image, (0, (width - height) // 2))
|
||||
return result
|
||||
else:
|
||||
result = Image.new(image.mode, (height, height), background_color)
|
||||
result.paste(image, ((height - width) // 2, 0))
|
||||
return result
|
||||
|
||||
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
|
||||
|
||||
# test with images in channel-last and channel-first format
|
||||
for image in image_inputs:
|
||||
padded_image = image_processor.pad_to_square(image)
|
||||
padded_image_original = pad_to_square_original(Image.fromarray(image))
|
||||
padded_image_original = np.array(padded_image_original)
|
||||
|
||||
np.testing.assert_allclose(padded_image, padded_image_original)
|
||||
|
||||
padded_image = image_processor.pad_to_square(image.transpose(2, 0, 1), input_data_format="channels_first")
|
||||
padded_image = padded_image.transpose(1, 2, 0)
|
||||
|
||||
np.testing.assert_allclose(padded_image, padded_image_original)
|
||||
|
||||
# test background color
|
||||
background_color = (122, 116, 104)
|
||||
for image in image_inputs:
|
||||
padded_image = image_processor.pad_to_square(image, background_color=background_color)
|
||||
padded_image_original = pad_to_square_original(Image.fromarray(image), background_color=background_color)
|
||||
padded_image_original = np.array(padded_image_original)
|
||||
|
||||
np.testing.assert_allclose(padded_image, padded_image_original)
|
||||
|
||||
background_color = 122
|
||||
for image in image_inputs:
|
||||
padded_image = image_processor.pad_to_square(image, background_color=background_color)
|
||||
padded_image_original = pad_to_square_original(Image.fromarray(image), background_color=background_color)
|
||||
padded_image_original = np.array(padded_image_original)
|
||||
|
||||
np.testing.assert_allclose(padded_image, padded_image_original)
|
||||
|
||||
# background color length should match channel length
|
||||
with self.assertRaises(ValueError):
|
||||
padded_image = image_processor.pad_to_square(image_inputs[0], background_color=(122, 104))
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
padded_image = image_processor.pad_to_square(image_inputs[0], background_color=(122, 104, 0, 0))
|
||||
|
||||
@unittest.skip(reason="LLaVa does not support 4 channel images yet")
|
||||
# Ignore copy
|
||||
def test_call_numpy_4_channels(self):
|
||||
pass
|
||||
Reference in New Issue
Block a user