Add fast image processor Janus, Deepseek VL, Deepseek VL hybrid (#39739)
* add fast image processor Janus, deepseek_vl, deepseek_vl_hybrid * fix after review
This commit is contained in:
@@ -209,6 +209,10 @@ model = DeepseekVLForConditionalGeneration.from_pretrained(
|
||||
|
||||
[[autodoc]] DeepseekVLImageProcessor
|
||||
|
||||
## DeepseekVLImageProcessorFast
|
||||
|
||||
[[autodoc]] DeepseekVLImageProcessorFast
|
||||
|
||||
## DeepseekVLModel
|
||||
|
||||
[[autodoc]] DeepseekVLModel
|
||||
|
||||
@@ -208,6 +208,10 @@ model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
|
||||
|
||||
[[autodoc]] DeepseekVLHybridImageProcessor
|
||||
|
||||
## DeepseekVLHybridImageProcessorFast
|
||||
|
||||
[[autodoc]] DeepseekVLHybridImageProcessorFast
|
||||
|
||||
## DeepseekVLHybridModel
|
||||
|
||||
[[autodoc]] DeepseekVLHybridModel
|
||||
|
||||
@@ -209,6 +209,10 @@ for i, image in enumerate(images['pixel_values']):
|
||||
|
||||
[[autodoc]] JanusImageProcessor
|
||||
|
||||
## JanusImageProcessorFast
|
||||
|
||||
[[autodoc]] JanusImageProcessorFast
|
||||
|
||||
## JanusVisionModel
|
||||
|
||||
[[autodoc]] JanusVisionModel
|
||||
|
||||
@@ -78,8 +78,8 @@ else:
|
||||
("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
|
||||
("cvt", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
|
||||
("data2vec-vision", ("BeitImageProcessor", "BeitImageProcessorFast")),
|
||||
("deepseek_vl", ("DeepseekVLImageProcessor")),
|
||||
("deepseek_vl_hybrid", ("DeepseekVLHybridImageProcessor")),
|
||||
("deepseek_vl", ("DeepseekVLImageProcessor", "DeepseekVLImageProcessorFast")),
|
||||
("deepseek_vl_hybrid", ("DeepseekVLHybridImageProcessor", "DeepseekVLHybridImageProcessorFast")),
|
||||
("deformable_detr", ("DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast")),
|
||||
("deit", ("DeiTImageProcessor", "DeiTImageProcessorFast")),
|
||||
("depth_anything", ("DPTImageProcessor", "DPTImageProcessorFast")),
|
||||
@@ -113,7 +113,7 @@ else:
|
||||
("imagegpt", ("ImageGPTImageProcessor",)),
|
||||
("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")),
|
||||
("instructblipvideo", ("InstructBlipVideoImageProcessor",)),
|
||||
("janus", ("JanusImageProcessor")),
|
||||
("janus", ("JanusImageProcessor", "JanusImageProcessorFast")),
|
||||
("kosmos-2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
|
||||
("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")),
|
||||
("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),
|
||||
|
||||
@@ -20,7 +20,9 @@
|
||||
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
from ...utils import (
|
||||
logging,
|
||||
)
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
|
||||
@@ -131,6 +131,7 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
|
||||
self,
|
||||
image: np.ndarray,
|
||||
size: Union[dict[str, int], int],
|
||||
background_color: Optional[tuple[int, int, int]] = None,
|
||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
||||
data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
@@ -142,6 +143,10 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to resize.
|
||||
size (`dict[str, int]` or `int`):
|
||||
The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
|
||||
background_color (`tuple[int, int, int]`):
|
||||
The background color to use for the padding.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
|
||||
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
|
||||
data_format (`ChannelDimension` or `str`, *optional*):
|
||||
@@ -160,6 +165,7 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
|
||||
Returns:
|
||||
`np.ndarray`: The resized image.
|
||||
"""
|
||||
background_color = background_color if background_color is not None else self.background_color
|
||||
if input_data_format is None:
|
||||
input_data_format = infer_channel_dimension_format(image)
|
||||
|
||||
@@ -191,7 +197,7 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
|
||||
# Expand and pad the images to obtain a square image of dimensions `size x size`
|
||||
image = self.pad_to_square(
|
||||
image=image,
|
||||
background_color=self.background_color,
|
||||
background_color=background_color,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
return image
|
||||
@@ -406,9 +412,5 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
|
||||
|
||||
return result
|
||||
|
||||
def postprocess(self):
|
||||
"""Applies post-processing to the decoded image tokens by reversing transformations applied during preprocessing."""
|
||||
raise AttributeError("Not needed for DeepseekVL")
|
||||
|
||||
|
||||
__all__ = ["DeepseekVLImageProcessor"]
|
||||
|
||||
@@ -0,0 +1,199 @@
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py.
|
||||
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_deepseek_vl.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
is_torch_available,
|
||||
)
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
class DeepseekVLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    r"""
    min_size (`int`, *optional*, defaults to 14):
        Lower bound applied to both sides of the resized image: after resizing, neither the height nor the
        width is allowed to be smaller than this value.
    """

    # Extra keyword accepted on top of the default fast image processor kwargs.
    min_size: int
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class DeepseekVLImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BICUBIC
|
||||
image_mean = OPENAI_CLIP_MEAN
|
||||
image_std = OPENAI_CLIP_STD
|
||||
size = {"height": 384, "width": 384}
|
||||
min_size = 14
|
||||
do_resize = True
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
valid_kwargs = DeepseekVLFastImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLFastImageProcessorKwargs]):
    """
    Initialize the processor and derive the padding color from the `image_mean` keyword.

    When `image_mean` is not passed (or is `None`), the padding color is mid-gray `(127, 127, 127)`.
    Otherwise each mean value is multiplied by 255 and truncated to an integer.
    """
    super().__init__(**kwargs)
    image_mean = kwargs.get("image_mean", None)
    if image_mean is not None:
        # Scale each mean value by 255 and truncate to an integer.
        self.background_color = tuple(int(channel_mean * 255) for channel_mean in image_mean)
    else:
        self.background_color = (127, 127, 127)
|
||||
|
||||
def resize(
    self,
    image: "torch.Tensor",
    size: SizeDict,
    min_size: int,
    interpolation: "F.InterpolationMode" = None,
    antialias: bool = True,
    **kwargs,
) -> "torch.Tensor":
    """
    Resize `image` so that its longest edge matches the (square) target size, keeping the aspect ratio.

    Args:
        image (`torch.Tensor`):
            Image(s) to resize; the last two dimensions are read as height and width.
        size (`SizeDict`):
            Target size. `height` and `width` must both be set and equal.
        min_size (`int`):
            Lower bound applied to each side of the computed output size.
        interpolation (`F.InterpolationMode`, *optional*):
            Forwarded unchanged to the parent `resize`.
        antialias (`bool`, *optional*, defaults to `True`):
            Forwarded unchanged to the parent `resize`.

    Returns:
        `torch.Tensor`: The result of the parent `resize` for the computed, not yet padded, size.

    Raises:
        ValueError: If the target height or width is missing, or the two differ.
    """
    target_height, target_width = size.height, size.width
    if not (target_height is not None and target_width is not None and target_height == target_width):
        raise ValueError(
            f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
        )

    source_height, source_width = image.shape[-2:]
    # Scale factor that maps the longest source edge onto the target edge.
    scale = target_height / max(source_height, source_width)

    # The shorter edge follows the aspect ratio; both edges are clamped from below by `min_size`.
    scaled_height = max(int(source_height * scale), min_size)
    scaled_width = max(int(source_width * scale), min_size)
    output_size_nonpadded = SizeDict(height=scaled_height, width=scaled_width)

    return super().resize(image, size=output_size_nonpadded, interpolation=interpolation, antialias=antialias)
|
||||
|
||||
def pad_to_square(
    self,
    images: "torch.Tensor",
    background_color: Union[int, tuple[int, int, int]] = 0,
) -> "torch.Tensor":
    """
    Pads a batch of images to a square based on the longest edge, centering the original content.

    Args:
        images (`torch.Tensor`):
            The images to pad. Indexed as `(batch, channels, height, width)`.
        background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
            The color to use for the padding. Either a single integer or a sequence with exactly one
            value per channel. If passed as an integer in multi-channel mode, only the first channel
            uses it and the remaining channels are padded with `0`.

    Returns:
        `torch.Tensor`: The padded images. The input tensor itself is returned when it is already square.

    Raises:
        ValueError: If `background_color` is a sequence whose length differs from the number of channels.
    """
    height, width = images.shape[-2:]
    if height == width:
        return images

    batch_size, num_channels = images.shape[0], images.shape[1]

    # Normalize the color to a per-channel list; a bare int only fills channel 0.
    if isinstance(background_color, int):
        channel_colors = [background_color]
    elif len(background_color) != num_channels:
        # The check is an exact-length match, so the message says "exactly".
        raise ValueError(
            f"background_color must have exactly {num_channels} elements to match the number of channels, "
            f"got {len(background_color)}"
        )
    else:
        channel_colors = list(background_color)

    max_dim = max(height, width)
    padded_images = torch.zeros(
        (batch_size, num_channels, max_dim, max_dim), dtype=images.dtype, device=images.device
    )
    for channel, color in enumerate(channel_colors):
        padded_images[:, channel, :, :] = color

    # Only the shorter dimension gets a non-zero offset; the longer one starts at 0.
    top = (max_dim - height) // 2
    left = (max_dim - width) // 2
    padded_images[:, :, top : top + height, left : left + width] = images

    return padded_images
|
||||
|
||||
def _preprocess(
    self,
    images: list["torch.Tensor"],
    do_resize: bool,
    size: SizeDict,
    min_size: int,
    interpolation: Optional["F.InterpolationMode"],
    do_rescale: bool,
    rescale_factor: float,
    do_normalize: bool,
    image_mean: Optional[Union[float, list[float]]],
    image_std: Optional[Union[float, list[float]]],
    disable_grouping: Optional[bool],
    return_tensors: Optional[Union[str, TensorType]],
    do_pad: bool = True,
    **kwargs,
) -> BatchFeature:
    """
    Resize, pad to square, then rescale/normalize a list of images.

    Images are grouped by shape before each stage so every group is handled as one stacked tensor,
    and are put back in their original order afterwards.

    Returns:
        `BatchFeature`: Holds `pixel_values`; stacked into a single tensor when `return_tensors` is set,
        otherwise a list of tensors.
    """
    # Stage 1: optional resize, one stacked batch per input shape.
    batches_by_shape, order_index = group_images_by_shape(images, disable_grouping=disable_grouping)
    resized_by_shape = {}
    for shape, batch in batches_by_shape.items():
        resized_by_shape[shape] = (
            self.resize(image=batch, size=size, min_size=min_size, interpolation=interpolation)
            if do_resize
            else batch
        )
    resized_images = reorder_images(resized_by_shape, order_index)

    # Stage 2: regroup, since shapes may differ after stage 1 (or resizing was skipped),
    # then pad and apply the fused rescale + normalize.
    batches_by_shape, order_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
    finished_by_shape = {}
    for shape, batch in batches_by_shape.items():
        if do_pad:
            batch = self.pad_to_square(batch, background_color=self.background_color)
        finished_by_shape[shape] = self.rescale_and_normalize(
            batch, do_rescale, rescale_factor, do_normalize, image_mean, image_std
        )

    processed_images = reorder_images(finished_by_shape, order_index)
    if return_tensors:
        processed_images = torch.stack(processed_images, dim=0)

    return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
|
||||
|
||||
|
||||
__all__ = ["DeepseekVLImageProcessorFast"]
|
||||
@@ -33,6 +33,7 @@ from ...utils import (
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel
|
||||
from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCausalLMOutputWithPast
|
||||
from ..janus.image_processing_janus import JanusImageProcessor
|
||||
from ..janus.image_processing_janus_fast import JanusImageProcessorFast
|
||||
from ..janus.modeling_janus import JanusForConditionalGeneration, JanusModel, JanusPreTrainedModel
|
||||
|
||||
|
||||
@@ -181,6 +182,9 @@ class DeepseekVLForConditionalGeneration(JanusForConditionalGeneration):
|
||||
|
||||
|
||||
class DeepseekVLImageProcessor(JanusImageProcessor):
|
||||
def __init__(self, **super_kwargs):
|
||||
super().__init__(**super_kwargs)
|
||||
|
||||
def postprocess(self):
|
||||
raise AttributeError("Not needed for DeepseekVL")
|
||||
|
||||
@@ -188,6 +192,14 @@ class DeepseekVLImageProcessor(JanusImageProcessor):
|
||||
raise AttributeError("Not needed for DeepseekVL")
|
||||
|
||||
|
||||
class DeepseekVLImageProcessorFast(JanusImageProcessorFast):
|
||||
def __init__(self, **super_kwargs):
|
||||
super().__init__(**super_kwargs)
|
||||
|
||||
def postprocess(self):
|
||||
raise AttributeError("Not needed for DeepseekVL")
|
||||
|
||||
|
||||
class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False):
|
||||
_defaults = {
|
||||
"text_kwargs": {"padding": False},
|
||||
@@ -322,5 +334,6 @@ __all__ = [
|
||||
"DeepseekVLModel",
|
||||
"DeepseekVLForConditionalGeneration",
|
||||
"DeepseekVLImageProcessor",
|
||||
"DeepseekVLImageProcessorFast",
|
||||
"DeepseekVLProcessor",
|
||||
]
|
||||
|
||||
@@ -21,6 +21,7 @@ if TYPE_CHECKING:
|
||||
from .configuration_deepseek_vl_hybrid import *
|
||||
from .image_processing_deepseek_vl_fast_hybrid import *
|
||||
from .image_processing_deepseek_vl_hybrid import *
|
||||
from .image_processing_deepseek_vl_hybrid_fast import *
|
||||
from .modeling_deepseek_vl_hybrid import *
|
||||
from .processing_deepseek_vl_hybrid import *
|
||||
else:
|
||||
|
||||
@@ -154,14 +154,15 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
|
||||
self.background_color = tuple([int(x * 255) for x in image_mean])
|
||||
|
||||
if high_res_image_mean is None:
|
||||
self.background_color = (127, 127, 127)
|
||||
self.high_res_background_color = (127, 127, 127)
|
||||
else:
|
||||
self.background_color = tuple([int(x * 255) for x in high_res_image_mean])
|
||||
self.high_res_background_color = tuple([int(x * 255) for x in high_res_image_mean])
|
||||
|
||||
def resize(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
size: Union[dict[str, int], int],
|
||||
background_color: Optional[tuple[int, int, int]] = None,
|
||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
||||
data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
@@ -173,6 +174,10 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to resize.
|
||||
size (`dict[str, int]` or `int`):
|
||||
The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
|
||||
background_color (`tuple[int, int, int]`):
|
||||
The background color to use for the padding.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
|
||||
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
|
||||
data_format (`ChannelDimension` or `str`, *optional*):
|
||||
@@ -191,6 +196,7 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
|
||||
Returns:
|
||||
`np.ndarray`: The resized image.
|
||||
"""
|
||||
background_color = background_color if background_color is not None else self.background_color
|
||||
if input_data_format is None:
|
||||
input_data_format = infer_channel_dimension_format(image)
|
||||
|
||||
@@ -222,7 +228,7 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
|
||||
# Expand and pad the images to obtain a square image of dimensions `size x size`
|
||||
image = self.pad_to_square(
|
||||
image=image,
|
||||
background_color=self.background_color,
|
||||
background_color=background_color,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
return image
|
||||
@@ -361,16 +367,20 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
|
||||
# high_res_image: resize (high) -> rescale -> normalize (high)
|
||||
# low_res_image: resize (high) -> rescale -> resize (low) -> normalize (low)
|
||||
high_res_image = image
|
||||
|
||||
if do_resize:
|
||||
high_res_image = self.resize(
|
||||
image=high_res_image,
|
||||
size=high_res_size_dict,
|
||||
background_color=self.high_res_background_color,
|
||||
resample=high_res_resample,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
image = self.resize(
|
||||
image=high_res_image, size=size_dict, resample=resample, input_data_format=input_data_format
|
||||
image=high_res_image,
|
||||
size=size_dict,
|
||||
background_color=self.background_color,
|
||||
resample=resample,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
|
||||
if do_rescale:
|
||||
@@ -475,9 +485,5 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
|
||||
|
||||
return result
|
||||
|
||||
def postprocess(self):
|
||||
"""Applies post-processing to the decoded image tokens by reversing transformations applied during preprocessing."""
|
||||
raise AttributeError("Not needed for DeepseekVLHybrid")
|
||||
|
||||
|
||||
__all__ = ["DeepseekVLHybridImageProcessor"]
|
||||
|
||||
@@ -0,0 +1,326 @@
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# This file was automatically generated from src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py.
|
||||
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_deepseek_vl_hybrid.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
get_size_dict,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ChannelDimension, PILImageResampling, SizeDict
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
is_torchvision_available,
|
||||
is_torchvision_v2_available,
|
||||
)
|
||||
|
||||
|
||||
if is_torchvision_v2_available():
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
|
||||
from ...image_utils import pil_torch_interpolation_mapping
|
||||
elif is_torchvision_available():
|
||||
from torchvision.transforms import functional as F
|
||||
|
||||
from ...image_utils import pil_torch_interpolation_mapping
|
||||
|
||||
|
||||
class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    r"""
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width
        falls below this value after resizing.
    high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
        Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
        method.
    high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
        Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
        overridden by the `high_res_resample` parameter in the `preprocess` method.
    high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
        Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
        channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
    high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
        Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
        number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
    """

    min_size: int
    high_res_size: dict[str, int]
    high_res_resample: "PILImageResampling"
    # Widened to match the documented contract above and the `_preprocess` signature,
    # which both accept a single float as well as a per-channel list.
    high_res_image_mean: Union[float, list[float]]
    high_res_image_std: Union[float, list[float]]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BICUBIC
|
||||
image_mean = OPENAI_CLIP_MEAN
|
||||
image_std = OPENAI_CLIP_STD
|
||||
size = {"height": 384, "width": 384}
|
||||
min_size = 14
|
||||
do_resize = True
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
valid_kwargs = DeepseekVLHybridFastImageProcessorKwargs
|
||||
high_res_image_mean = OPENAI_CLIP_MEAN
|
||||
high_res_image_std = OPENAI_CLIP_STD
|
||||
high_res_size = {"height": 1024, "width": 1024}
|
||||
high_res_resample = PILImageResampling.BICUBIC
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]):
    """
    Initialize the processor and derive both padding colors from the mean keywords.

    `background_color` comes from `image_mean` and `high_res_background_color` from
    `high_res_image_mean`. A missing (or `None`) mean gives mid-gray `(127, 127, 127)`;
    otherwise each mean value is multiplied by 255 and truncated to an integer.
    """
    fallback_color = (127, 127, 127)

    image_mean = kwargs.get("image_mean", None)
    high_res_image_mean = kwargs.get("high_res_image_mean", None)

    background_color = (
        fallback_color if image_mean is None else tuple(int(value * 255) for value in image_mean)
    )
    high_res_background_color = (
        fallback_color
        if high_res_image_mean is None
        else tuple(int(value * 255) for value in high_res_image_mean)
    )

    super().__init__(**kwargs)
    self.background_color = background_color
    self.high_res_background_color = high_res_background_color
|
||||
|
||||
def resize(
    self,
    image: "torch.Tensor",
    size: SizeDict,
    min_size: int,
    interpolation: "F.InterpolationMode" = None,
    antialias: bool = True,
    **kwargs,
) -> "torch.Tensor":
    """
    Resize `image` so that its longest edge matches the (square) target size, keeping the aspect ratio.

    Args:
        image (`torch.Tensor`):
            Image(s) to resize; the last two dimensions are read as height and width.
        size (`SizeDict`):
            Target size. `height` and `width` must both be set and equal.
        min_size (`int`):
            Lower bound applied to each side of the computed output size.
        interpolation (`F.InterpolationMode`, *optional*):
            Forwarded unchanged to the parent `resize`.
        antialias (`bool`, *optional*, defaults to `True`):
            Forwarded unchanged to the parent `resize`.

    Returns:
        `torch.Tensor`: The result of the parent `resize` for the computed, not yet padded, size.

    Raises:
        ValueError: If the target height or width is missing, or the two differ.
    """
    target_height, target_width = size.height, size.width
    if not (target_height is not None and target_width is not None and target_height == target_width):
        raise ValueError(
            f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
        )

    source_height, source_width = image.shape[-2:]
    # Scale factor that maps the longest source edge onto the target edge.
    scale = target_height / max(source_height, source_width)

    # The shorter edge follows the aspect ratio; both edges are clamped from below by `min_size`.
    scaled_height = max(int(source_height * scale), min_size)
    scaled_width = max(int(source_width * scale), min_size)
    output_size_nonpadded = SizeDict(height=scaled_height, width=scaled_width)

    return super().resize(image, size=output_size_nonpadded, interpolation=interpolation, antialias=antialias)
|
||||
|
||||
def pad_to_square(
    self,
    images: "torch.Tensor",
    background_color: Union[int, tuple[int, int, int]] = 0,
) -> "torch.Tensor":
    """
    Pads a batch of images to a square based on the longest edge, centering the original content.

    Args:
        images (`torch.Tensor`):
            The images to pad. Indexed as `(batch, channels, height, width)`.
        background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
            The color to use for the padding. Either a single integer or a sequence with exactly one
            value per channel. If passed as an integer in multi-channel mode, only the first channel
            uses it and the remaining channels are padded with `0`.

    Returns:
        `torch.Tensor`: The padded images. The input tensor itself is returned when it is already square.

    Raises:
        ValueError: If `background_color` is a sequence whose length differs from the number of channels.
    """
    height, width = images.shape[-2:]
    if height == width:
        return images

    batch_size, num_channels = images.shape[0], images.shape[1]

    # Normalize the color to a per-channel list; a bare int only fills channel 0.
    if isinstance(background_color, int):
        channel_colors = [background_color]
    elif len(background_color) != num_channels:
        # The check is an exact-length match, so the message says "exactly".
        raise ValueError(
            f"background_color must have exactly {num_channels} elements to match the number of channels, "
            f"got {len(background_color)}"
        )
    else:
        channel_colors = list(background_color)

    max_dim = max(height, width)
    padded_images = torch.zeros(
        (batch_size, num_channels, max_dim, max_dim), dtype=images.dtype, device=images.device
    )
    for channel, color in enumerate(channel_colors):
        padded_images[:, channel, :, :] = color

    # Only the shorter dimension gets a non-zero offset; the longer one starts at 0.
    top = (max_dim - height) // 2
    left = (max_dim - width) // 2
    padded_images[:, :, top : top + height, left : left + width] = images

    return padded_images
|
||||
|
||||
def _preprocess(
    self,
    images: list["torch.Tensor"],
    do_resize: bool,
    size: SizeDict,
    high_res_size: SizeDict,
    min_size: int,
    interpolation: Optional["F.InterpolationMode"],
    high_res_interpolation: Optional["F.InterpolationMode"],
    do_rescale: bool,
    rescale_factor: float,
    do_normalize: bool,
    image_mean: Optional[Union[float, list[float]]],
    image_std: Optional[Union[float, list[float]]],
    high_res_image_mean: Optional[Union[float, list[float]]],
    high_res_image_std: Optional[Union[float, list[float]]],
    disable_grouping: Optional[bool],
    return_tensors: Optional[Union[str, TensorType]],
    do_pad: bool = True,
    **kwargs,
) -> BatchFeature:
    """
    Produce a high resolution and a low resolution version of every image.

    Pipeline:
        high res: resize (high) -> pad -> rescale/normalize (high res mean/std)
        low res:  padded high res image -> resize (low) -> pad -> rescale/normalize (mean/std)

    Images are grouped by shape before each stage so every group is handled as one stacked tensor,
    and are put back in their original order afterwards.

    Returns:
        `BatchFeature`: Holds `pixel_values` (low resolution) and `high_res_pixel_values`; each is
        stacked into a single tensor when `return_tensors` is set, otherwise a list of tensors.
    """
    # Group images by size for batched resizing
    grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
    high_res_resized_images_grouped = {}
    for shape, stacked_images in grouped_images.items():
        # Start from the untouched batch so that `do_resize=False` passes the images through
        # instead of reading a variable that was never assigned.
        stacked_high_res_images = stacked_images
        if do_resize:
            stacked_high_res_images = self.resize(
                image=stacked_images, size=high_res_size, min_size=min_size, interpolation=high_res_interpolation
            )
        high_res_resized_images_grouped[shape] = stacked_high_res_images
    high_res_resized_images = reorder_images(high_res_resized_images_grouped, grouped_images_index)

    # Group images by size for further processing
    # Needed in case do_resize is False, or resize returns images with different sizes
    grouped_high_res_images, grouped_high_res_images_index = group_images_by_shape(
        high_res_resized_images, disable_grouping=disable_grouping
    )
    high_res_padded_images = {}
    high_res_processed_images_grouped = {}
    for shape, stacked_high_res_images in grouped_high_res_images.items():
        if do_pad:
            stacked_high_res_images = self.pad_to_square(
                stacked_high_res_images, background_color=self.high_res_background_color
            )
        # Keep the padded, not yet normalized batch: it is the input of the low resolution branch.
        high_res_padded_images[shape] = stacked_high_res_images
        # Fused rescale and normalize
        stacked_high_res_images = self.rescale_and_normalize(
            stacked_high_res_images,
            do_rescale,
            rescale_factor,
            do_normalize,
            high_res_image_mean,
            high_res_image_std,
        )
        high_res_processed_images_grouped[shape] = stacked_high_res_images
    high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index)
    high_res_processed_images = (
        torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images
    )

    resized_images_grouped = {}
    for shape, stacked_high_res_padded_images in high_res_padded_images.items():
        # Default to the padded high resolution batch of this group; without this, `do_resize=False`
        # would reuse a stale batch left over from the first loop.
        stacked_images = stacked_high_res_padded_images
        if do_resize:
            stacked_images = self.resize(
                image=stacked_high_res_padded_images, size=size, min_size=min_size, interpolation=interpolation
            )
        resized_images_grouped[shape] = stacked_images
    resized_images = reorder_images(resized_images_grouped, grouped_high_res_images_index)

    grouped_resized_images, grouped_resized_images_index = group_images_by_shape(
        resized_images, disable_grouping=disable_grouping
    )
    processed_images_grouped = {}
    for shape, stacked_images in grouped_resized_images.items():
        if do_pad:
            stacked_images = self.pad_to_square(stacked_images, background_color=self.background_color)
        # Fused rescale and normalize
        stacked_images = self.rescale_and_normalize(
            stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
        )
        processed_images_grouped[shape] = stacked_images
    processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index)
    processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

    return BatchFeature(
        data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images},
        tensor_type=return_tensors,
    )
|
||||
|
||||
def _further_process_kwargs(
    self,
    size: Optional[SizeDict] = None,
    high_res_size: Optional[SizeDict] = None,
    default_to_square: Optional[bool] = None,
    image_mean: Optional[Union[float, list[float]]] = None,
    image_std: Optional[Union[float, list[float]]] = None,
    high_res_image_mean: Optional[Union[float, list[float]]] = None,
    high_res_image_std: Optional[Union[float, list[float]]] = None,
    data_format: Optional[ChannelDimension] = None,
    **kwargs,
) -> dict:
    """
    Update kwargs that need further processing before being validated.
    Can be overridden by subclasses to customize the processing of kwargs.

    The following conversions are applied:
        - `size` / `high_res_size` are converted to `SizeDict` when given.
        - list-valued means and standard deviations are converted to tuples.
        - `data_format` defaults to `ChannelDimension.FIRST`.
        - `high_res_resample` is removed from the kwargs and stored as `high_res_interpolation`,
          mapped through `pil_torch_interpolation_mapping` when it is an `int` or `PILImageResampling`.

    Returns:
        `dict`: The remaining keyword arguments together with the processed values.
    """
    if size is not None:
        size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square))
    if high_res_size is not None:
        high_res_size = SizeDict(**get_size_dict(size=high_res_size, default_to_square=default_to_square))
    if isinstance(image_mean, list):
        image_mean = tuple(image_mean)
    if isinstance(image_std, list):
        image_std = tuple(image_std)
    if isinstance(high_res_image_mean, list):
        high_res_image_mean = tuple(high_res_image_mean)
    if isinstance(high_res_image_std, list):
        high_res_image_std = tuple(high_res_image_std)
    if data_format is None:
        data_format = ChannelDimension.FIRST

    # Fall back to the processor's own default instead of raising a bare KeyError when the caller
    # did not supply `high_res_resample`.
    high_res_resample = kwargs.pop("high_res_resample", self.high_res_resample)
    kwargs["high_res_interpolation"] = (
        pil_torch_interpolation_mapping[high_res_resample]
        if isinstance(high_res_resample, (int, PILImageResampling))
        else high_res_resample
    )

    kwargs["size"] = size
    kwargs["high_res_size"] = high_res_size
    kwargs["default_to_square"] = default_to_square
    kwargs["image_mean"] = image_mean
    kwargs["image_std"] = image_std
    kwargs["high_res_image_mean"] = high_res_image_mean
    kwargs["high_res_image_std"] = high_res_image_std
    kwargs["data_format"] = data_format

    return kwargs
|
||||
|
||||
|
||||
__all__ = ["DeepseekVLHybridImageProcessorFast"]
|
||||
@@ -20,7 +20,10 @@ import torch.nn as nn
|
||||
from ...cache_utils import Cache
|
||||
from ...image_processing_utils_fast import (
|
||||
BatchFeature,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
get_size_dict,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
from ...image_transforms import convert_to_rgb, to_channel_dimension_format
|
||||
from ...image_utils import (
|
||||
@@ -29,6 +32,7 @@ from ...image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
SizeDict,
|
||||
infer_channel_dimension_format,
|
||||
is_scaled_image,
|
||||
make_flat_list_of_images,
|
||||
@@ -48,11 +52,14 @@ from ...utils import (
|
||||
auto_docstring,
|
||||
can_return_tuple,
|
||||
filter_out_non_signature_kwargs,
|
||||
is_torchvision_available,
|
||||
is_torchvision_v2_available,
|
||||
logging,
|
||||
)
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel
|
||||
from ..deepseek_vl.configuration_deepseek_vl import DeepseekVLConfig
|
||||
from ..deepseek_vl.image_processing_deepseek_vl import DeepseekVLImageProcessor
|
||||
from ..deepseek_vl.image_processing_deepseek_vl_fast import DeepseekVLImageProcessorFast
|
||||
from ..deepseek_vl.modeling_deepseek_vl import (
|
||||
DeepseekVLForConditionalGeneration,
|
||||
DeepseekVLModel,
|
||||
@@ -63,6 +70,16 @@ from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCa
|
||||
from ..sam.modeling_sam import SamLayerNorm, SamVisionNeck
|
||||
|
||||
|
||||
if is_torchvision_v2_available():
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
|
||||
from ...image_utils import pil_torch_interpolation_mapping
|
||||
elif is_torchvision_available():
|
||||
from torchvision.transforms import functional as F
|
||||
|
||||
from ...image_utils import pil_torch_interpolation_mapping
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@@ -516,9 +533,9 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
|
||||
)
|
||||
|
||||
if high_res_image_mean is None:
|
||||
self.background_color = (127, 127, 127)
|
||||
self.high_res_background_color = (127, 127, 127)
|
||||
else:
|
||||
self.background_color = tuple([int(x * 255) for x in high_res_image_mean])
|
||||
self.high_res_background_color = tuple([int(x * 255) for x in high_res_image_mean])
|
||||
|
||||
@filter_out_non_signature_kwargs()
|
||||
def preprocess(
|
||||
@@ -654,16 +671,20 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
|
||||
# high_res_image: resize (high) -> rescale -> normalize (high)
|
||||
# low_res_image: resize (high) -> rescale -> resize (low) -> normalize (low)
|
||||
high_res_image = image
|
||||
|
||||
if do_resize:
|
||||
high_res_image = self.resize(
|
||||
image=high_res_image,
|
||||
size=high_res_size_dict,
|
||||
background_color=self.high_res_background_color,
|
||||
resample=high_res_resample,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
image = self.resize(
|
||||
image=high_res_image, size=size_dict, resample=resample, input_data_format=input_data_format
|
||||
image=high_res_image,
|
||||
size=size_dict,
|
||||
background_color=self.background_color,
|
||||
resample=resample,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
|
||||
if do_rescale:
|
||||
@@ -695,6 +716,192 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
|
||||
class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    r"""
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width
        falls below this value after resizing.
    high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
        Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
        method.
    high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
        Resampling filter to use when resizing the high resolution image. Only has an effect if `do_resize` is set to `True`. Can be
        overridden by the `high_res_resample` parameter in the `preprocess` method.
    high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
        Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
        channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
    high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
        Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
        number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
    """

    # Keys added on top of those declared by `DefaultFastImageProcessorKwargs`.
    min_size: int
    high_res_size: dict
    high_res_resample: "PILImageResampling"
    high_res_image_mean: list[float]
    high_res_image_std: list[float]
|
||||
|
||||
|
||||
class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
|
||||
high_res_image_mean = OPENAI_CLIP_MEAN
|
||||
high_res_image_std = OPENAI_CLIP_STD
|
||||
high_res_size = {"height": 1024, "width": 1024}
|
||||
high_res_resample = PILImageResampling.BICUBIC
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]):
    """
    Initialize the processor and derive both padding colors from the normalization means.

    Each padding color is the matching mean multiplied by 255 and truncated to integers.
    Mid-gray `(127, 127, 127)` is used when the corresponding mean is not passed explicitly.

    Args:
        **kwargs:
            Processor options forwarded to the parent constructor. `image_mean` and
            `high_res_image_mean` are additionally read here to compute the padding colors.
    """

    def _padding_color(mean):
        # Only an explicitly passed mean is honored here; class-level defaults are not consulted.
        if mean is None:
            return (127, 127, 127)
        return tuple(int(channel * 255) for channel in mean)

    background_color = _padding_color(kwargs.get("image_mean"))
    high_res_background_color = _padding_color(kwargs.get("high_res_image_mean"))

    # Initialize *this* instance. `DeepseekVLImageProcessorFast().__init__(**kwargs)` built and
    # initialized a throwaway object instead, so the parent constructor never ran on `self`.
    super().__init__(**kwargs)

    # Set once the parent constructor has run, matching the original statement order.
    self.background_color = background_color
    self.high_res_background_color = high_res_background_color
|
||||
|
||||
def _further_process_kwargs(
    self,
    size: Optional[SizeDict] = None,
    high_res_size: Optional[SizeDict] = None,
    default_to_square: Optional[bool] = None,
    image_mean: Optional[Union[float, list[float]]] = None,
    image_std: Optional[Union[float, list[float]]] = None,
    high_res_image_mean: Optional[Union[float, list[float]]] = None,
    high_res_image_std: Optional[Union[float, list[float]]] = None,
    data_format: Optional[ChannelDimension] = None,
    **kwargs,
) -> dict:
    """
    Convert raw keyword arguments into the forms used by the rest of the pipeline.

    Sizes become `SizeDict` instances, list-valued means and stds become tuples, `data_format`
    falls back to `ChannelDimension.FIRST`, and the `high_res_resample` entry is replaced by a
    `high_res_interpolation` entry. Any other keyword argument is passed through unchanged.

    Returns:
        `dict`: The updated keyword arguments.

    Raises:
        KeyError: If `high_res_resample` is not among the keyword arguments.
    """

    def _as_size_dict(raw_size):
        # `None` means "not provided" and is passed through untouched.
        if raw_size is None:
            return None
        return SizeDict(**get_size_dict(size=raw_size, default_to_square=default_to_square))

    def _freeze(stats):
        # Only lists are converted; floats, tuples and `None` are kept as they are.
        return tuple(stats) if isinstance(stats, list) else stats

    size = _as_size_dict(size)
    high_res_size = _as_size_dict(high_res_size)
    image_mean, image_std = _freeze(image_mean), _freeze(image_std)
    high_res_image_mean, high_res_image_std = _freeze(high_res_image_mean), _freeze(high_res_image_std)
    data_format = ChannelDimension.FIRST if data_format is None else data_format

    # Translate PIL resampling values to their torch counterpart; anything else is forwarded as given.
    high_res_resample = kwargs.pop("high_res_resample")
    if isinstance(high_res_resample, (int, PILImageResampling)):
        kwargs["high_res_interpolation"] = pil_torch_interpolation_mapping[high_res_resample]
    else:
        kwargs["high_res_interpolation"] = high_res_resample

    kwargs.update(
        size=size,
        high_res_size=high_res_size,
        default_to_square=default_to_square,
        image_mean=image_mean,
        image_std=image_std,
        high_res_image_mean=high_res_image_mean,
        high_res_image_std=high_res_image_std,
        data_format=data_format,
    )
    return kwargs
|
||||
|
||||
def _preprocess(
    self,
    images: list["torch.Tensor"],
    do_resize: bool,
    size: SizeDict,
    high_res_size: SizeDict,
    min_size: int,
    interpolation: Optional["F.InterpolationMode"],
    high_res_interpolation: Optional["F.InterpolationMode"],
    do_rescale: bool,
    rescale_factor: float,
    do_normalize: bool,
    image_mean: Optional[Union[float, list[float]]],
    image_std: Optional[Union[float, list[float]]],
    high_res_image_mean: Optional[Union[float, list[float]]],
    high_res_image_std: Optional[Union[float, list[float]]],
    disable_grouping: Optional[bool],
    return_tensors: Optional[Union[str, TensorType]],
    do_pad: bool = True,
    **kwargs,
) -> BatchFeature:
    """
    Produce the high resolution and the low resolution view of every image.

    High resolution path: resize to `high_res_size` -> pad to square -> rescale/normalize with the
    high resolution statistics. Low resolution path: starts from the padded (not yet normalized)
    high resolution images -> resize to `size` -> pad to square -> rescale/normalize with the
    regular statistics.

    When `do_resize` is `False` both resize steps are skipped and the images flow through
    unchanged at that stage.

    Args:
        images: Images to process.
        do_resize: Whether to run the two resize steps.
        size: Target size of the low resolution view.
        high_res_size: Target size of the high resolution view.
        min_size: Forwarded to `self.resize`.
        interpolation: Interpolation used for the low resolution resize.
        high_res_interpolation: Interpolation used for the high resolution resize.
        do_rescale, rescale_factor, do_normalize: Forwarded to `self.rescale_and_normalize`.
        image_mean, image_std: Statistics for the low resolution view.
        high_res_image_mean, high_res_image_std: Statistics for the high resolution view.
        disable_grouping: Forwarded to `group_images_by_shape`.
        return_tensors: When truthy, each view is stacked into a single tensor.
        do_pad: Whether to pad each view to a square.

    Returns:
        `BatchFeature`: Holds `pixel_values` (low resolution) and `high_res_pixel_values`.
    """
    # Group images by size for batched resizing
    grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
    high_res_resized_images_grouped = {}
    for shape, stacked_images in grouped_images.items():
        # Default to the untouched batch: with `do_resize=False` the variable used to be unbound
        # here, which raised `UnboundLocalError`.
        stacked_high_res_images = stacked_images
        if do_resize:
            stacked_high_res_images = self.resize(
                image=stacked_images, size=high_res_size, min_size=min_size, interpolation=high_res_interpolation
            )
        high_res_resized_images_grouped[shape] = stacked_high_res_images
    high_res_resized_images = reorder_images(high_res_resized_images_grouped, grouped_images_index)

    # Group images by size for further processing
    # Needed in case do_resize is False, or resize returns images with different sizes
    grouped_high_res_images, grouped_high_res_images_index = group_images_by_shape(
        high_res_resized_images, disable_grouping=disable_grouping
    )
    high_res_padded_images = {}
    high_res_processed_images_grouped = {}
    for shape, stacked_high_res_images in grouped_high_res_images.items():
        if do_pad:
            stacked_high_res_images = self.pad_to_square(
                stacked_high_res_images, background_color=self.high_res_background_color
            )
        # Keep the padded but not yet normalized batch: it is the input of the low resolution path.
        high_res_padded_images[shape] = stacked_high_res_images
        # Fused rescale and normalize
        stacked_high_res_images = self.rescale_and_normalize(
            stacked_high_res_images,
            do_rescale,
            rescale_factor,
            do_normalize,
            high_res_image_mean,
            high_res_image_std,
        )
        high_res_processed_images_grouped[shape] = stacked_high_res_images
    high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index)
    high_res_processed_images = (
        torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images
    )

    resized_images_grouped = {}
    for shape, stacked_high_res_padded_images in high_res_padded_images.items():
        # Default to the padded high resolution batch: with `do_resize=False` a stale
        # `stacked_images` left over from the first loop used to be stored for every group.
        stacked_images = stacked_high_res_padded_images
        if do_resize:
            stacked_images = self.resize(
                image=stacked_high_res_padded_images, size=size, min_size=min_size, interpolation=interpolation
            )
        resized_images_grouped[shape] = stacked_images
    # Same keys as `grouped_high_res_images`, hence the same index restores the original order.
    resized_images = reorder_images(resized_images_grouped, grouped_high_res_images_index)

    grouped_resized_images, grouped_resized_images_index = group_images_by_shape(
        resized_images, disable_grouping=disable_grouping
    )
    processed_images_grouped = {}
    for shape, stacked_images in grouped_resized_images.items():
        if do_pad:
            stacked_images = self.pad_to_square(stacked_images, background_color=self.background_color)
        # Fused rescale and normalize
        stacked_images = self.rescale_and_normalize(
            stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
        )
        processed_images_grouped[shape] = stacked_images
    processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index)
    processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

    return BatchFeature(
        data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images},
        tensor_type=return_tensors,
    )
|
||||
|
||||
|
||||
class DeepseekVLHybridProcessorKwargs(DeepseekVLProcessorKwargs):
    # Adds nothing of its own: same content as `DeepseekVLProcessorKwargs` under the hybrid model's name.
    pass
|
||||
|
||||
@@ -773,5 +980,6 @@ __all__ = [
|
||||
"DeepseekVLHybridModel",
|
||||
"DeepseekVLHybridForConditionalGeneration",
|
||||
"DeepseekVLHybridImageProcessor",
|
||||
"DeepseekVLHybridImageProcessorFast",
|
||||
"DeepseekVLHybridProcessor",
|
||||
]
|
||||
|
||||
@@ -20,6 +20,7 @@ from ...utils.import_utils import define_import_structure
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_janus import *
|
||||
from .image_processing_janus import *
|
||||
from .image_processing_janus_fast import *
|
||||
from .modeling_janus import *
|
||||
from .processing_janus import *
|
||||
else:
|
||||
|
||||
@@ -134,6 +134,7 @@ class JanusImageProcessor(BaseImageProcessor):
|
||||
self,
|
||||
image: np.ndarray,
|
||||
size: Union[dict[str, int], int],
|
||||
background_color: Optional[tuple[int, int, int]] = None,
|
||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
||||
data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
@@ -145,6 +146,10 @@ class JanusImageProcessor(BaseImageProcessor):
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to resize.
|
||||
size (`dict[str, int]` or `int`):
|
||||
The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
|
||||
background_color (`tuple[int, int, int]`):
|
||||
The background color to use for the padding.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
|
||||
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
|
||||
data_format (`ChannelDimension` or `str`, *optional*):
|
||||
@@ -163,6 +168,7 @@ class JanusImageProcessor(BaseImageProcessor):
|
||||
Returns:
|
||||
`np.ndarray`: The resized image.
|
||||
"""
|
||||
background_color = background_color if background_color is not None else self.background_color
|
||||
if input_data_format is None:
|
||||
input_data_format = infer_channel_dimension_format(image)
|
||||
|
||||
@@ -194,7 +200,7 @@ class JanusImageProcessor(BaseImageProcessor):
|
||||
# Expand and pad the images to obtain a square image of dimensions `size x size`
|
||||
image = self.pad_to_square(
|
||||
image=image,
|
||||
background_color=self.background_color,
|
||||
background_color=background_color,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
return image
|
||||
|
||||
245
src/transformers/models/janus/image_processing_janus_fast.py
Normal file
245
src/transformers/models/janus/image_processing_janus_fast.py
Normal file
@@ -0,0 +1,245 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
from ...image_utils import (
|
||||
OPENAI_CLIP_MEAN,
|
||||
OPENAI_CLIP_STD,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
SizeDict,
|
||||
)
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
is_torch_available,
|
||||
is_torchvision_available,
|
||||
is_torchvision_v2_available,
|
||||
)
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
if is_torchvision_v2_available():
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
elif is_torchvision_available():
|
||||
from torchvision.transforms import functional as F
|
||||
|
||||
|
||||
class JanusFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    r"""
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width
        falls below this value after resizing.
    """

    # Key added on top of those declared by `DefaultFastImageProcessorKwargs`.
    min_size: int
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class JanusImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BICUBIC
|
||||
image_mean = OPENAI_CLIP_MEAN
|
||||
image_std = OPENAI_CLIP_STD
|
||||
size = {"height": 384, "width": 384}
|
||||
min_size = 14
|
||||
do_resize = True
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
valid_kwargs = JanusFastImageProcessorKwargs
|
||||
|
||||
def __init__(self, **kwargs: Unpack[JanusFastImageProcessorKwargs]):
    """
    Initialize the processor and derive the padding color from `image_mean`.

    The padding color is `image_mean` multiplied by 255 and truncated to integers, or mid-gray
    `(127, 127, 127)` when `image_mean` is not passed explicitly.
    """
    explicit_mean = kwargs.get("image_mean", None)
    if explicit_mean is not None:
        background_color = tuple([int(x * 255) for x in explicit_mean])
    else:
        background_color = (127, 127, 127)

    super().__init__(**kwargs)
    # Set once the parent constructor has run.
    self.background_color = tuple(background_color)
|
||||
|
||||
def resize(
    self,
    image: "torch.Tensor",
    size: SizeDict,
    min_size: int,
    interpolation: "F.InterpolationMode" = None,
    antialias: bool = True,
    **kwargs,
) -> "torch.Tensor":
    """
    Resize so that the longest side equals the (square) target size, keeping the aspect ratio.

    The result is not padded: the shorter side is scaled by the same factor and each side is
    clamped from below by `min_size`.

    Args:
        image (`torch.Tensor`):
            Image(s) to resize; height and width are read from the last two dimensions.
        size (`SizeDict`):
            Target size. `height` and `width` must both be set and equal.
        min_size (`int`):
            Lower bound applied to each output side.
        interpolation (`F.InterpolationMode`, *optional*):
            Forwarded to the parent `resize`.
        antialias (`bool`, *optional*, defaults to `True`):
            Forwarded to the parent `resize`.

    Returns:
        `torch.Tensor`: The resized image(s), as returned by the parent `resize`.

    Raises:
        ValueError: If `size.height` or `size.width` is `None`, or if they differ.
    """
    has_square_target = size.height is not None and size.width is not None and size.height == size.width
    if not has_square_target:
        raise ValueError(
            f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
        )
    target_side = size.height

    height, width = image.shape[-2:]
    # Largest side becomes `target_side` and the other side is scaled according to the aspect ratio.
    scale = target_side / max(height, width)
    scaled_height = max(int(height * scale), min_size)
    scaled_width = max(int(width * scale), min_size)
    output_size_nonpadded = SizeDict(height=scaled_height, width=scaled_width)

    return super().resize(image, size=output_size_nonpadded, interpolation=interpolation, antialias=antialias)
|
||||
|
||||
def pad_to_square(
    self,
    images: "torch.Tensor",
    background_color: Union[int, tuple[int, int, int]] = 0,
) -> "torch.Tensor":
    """
    Pads a batch of images to a square based on the longest edge.

    The original content is centered along the shorter dimension and the remaining area is
    filled with `background_color`.

    Args:
        images (`torch.Tensor`):
            The images to pad, indexed as `(batch_size, num_channels, height, width)`.
        background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
            The color to use for the padding. Can be an integer for single channel or a
            tuple of integers for multi-channel images. If passed as an integer
            in multi-channel mode, it is used for the first channel and the subsequent
            channels default to `0`.

    Returns:
        `torch.Tensor`: The padded images, with the same dtype and device as `images`. When the
        images are already square, the input tensor itself is returned.

    Raises:
        ValueError: If `background_color` is a sequence whose length differs from the number of
            channels (only checked when padding is actually needed).
    """
    height, width = images.shape[-2:]
    num_channels = images.shape[1]
    batch_size = images.shape[0]

    if height == width:
        return images

    max_dim = max(height, width)

    # Ensure background_color is the correct shape
    if isinstance(background_color, int):
        # Only the first channel receives the value; the others keep the zero initialization below.
        background_color = [background_color]
    elif len(background_color) != num_channels:
        # The check is an exact match, so the message says "exactly" rather than "no more than".
        raise ValueError(
            f"background_color must have exactly {num_channels} elements to match the number of channels"
        )

    padded_images = torch.zeros(
        (batch_size, num_channels, max_dim, max_dim), dtype=images.dtype, device=images.device
    )
    for i, color in enumerate(background_color):
        padded_images[:, i, :, :] = color

    # Paste the original content in the middle of the shorter dimension.
    if width > height:
        start = (max_dim - height) // 2
        padded_images[:, :, start : start + height, :] = images
    else:
        start = (max_dim - width) // 2
        padded_images[:, :, :, start : start + width] = images

    return padded_images
|
||||
|
||||
def _preprocess(
    self,
    images: list["torch.Tensor"],
    do_resize: bool,
    size: SizeDict,
    min_size: int,
    interpolation: Optional["F.InterpolationMode"],
    do_rescale: bool,
    rescale_factor: float,
    do_normalize: bool,
    image_mean: Optional[Union[float, list[float]]],
    image_std: Optional[Union[float, list[float]]],
    disable_grouping: Optional[bool],
    return_tensors: Optional[Union[str, TensorType]],
    do_pad: bool = True,
    **kwargs,
) -> BatchFeature:
    """
    Resize, pad to square, then rescale and normalize a list of images.

    Images are processed in same-shape groups. They are regrouped after the resize step, and the
    original order is restored after each stage.

    Returns:
        `BatchFeature`: Holds `pixel_values`, stacked into one tensor when `return_tensors` is truthy.
    """
    # Stage 1: batch same-shaped inputs so that each group is resized in one call.
    inputs_by_shape, input_order = group_images_by_shape(images, disable_grouping=disable_grouping)
    resized_by_shape = {}
    for input_shape, batch in inputs_by_shape.items():
        resized_by_shape[input_shape] = (
            self.resize(image=batch, size=size, min_size=min_size, interpolation=interpolation)
            if do_resize
            else batch
        )
    resized_images = reorder_images(resized_by_shape, input_order)

    # Stage 2: regroup before the remaining steps.
    # Needed in case do_resize is False, or resize returns images with different sizes
    resized_by_new_shape, resized_order = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
    processed_by_shape = {}
    for resized_shape, batch in resized_by_new_shape.items():
        if do_pad:
            batch = self.pad_to_square(batch, background_color=self.background_color)
        # Fused rescale and normalize
        processed_by_shape[resized_shape] = self.rescale_and_normalize(
            batch, do_rescale, rescale_factor, do_normalize, image_mean, image_std
        )

    processed_images = reorder_images(processed_by_shape, resized_order)
    if return_tensors:
        processed_images = torch.stack(processed_images, dim=0)

    return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
|
||||
|
||||
def postprocess(
    self,
    images: ImageInput,
    do_rescale: Optional[bool] = None,
    rescale_factor: Optional[float] = None,
    do_normalize: Optional[bool] = None,
    image_mean: Optional[list[float]] = None,
    image_std: Optional[list[float]] = None,
    return_tensors: Optional[str] = None,
) -> BatchFeature:
    """
    Undo rescaling and normalization by running `preprocess` with inverted statistics.

    Resizing and padding are disabled for this pass.

    Args:
        images (`ImageInput`):
            Images to convert back.
        do_rescale (`bool`, *optional*):
            Whether to rescale. Defaults to `self.do_rescale`. When true, the result is clipped to
            `[0, 255]` and cast to `torch.uint8`.
        rescale_factor (`float`, *optional*):
            Factor applied during this pass. Defaults to `1.0 / self.rescale_factor`.
        do_normalize (`bool`, *optional*):
            Whether to apply the inverted normalization. Defaults to `self.do_normalize`.
        image_mean (`list[float]`, *optional*):
            Mean that was used for normalization. Defaults to `self.image_mean`.
        image_std (`list[float]`, *optional*):
            Standard deviation that was used for normalization. Defaults to `self.image_std`.
        return_tensors (`str`, *optional*):
            Tensor type of the returned `BatchFeature`. The value `"PIL.Image.Image"` instead
            converts each image with `F.to_pil_image`, provided `do_normalize` and `do_rescale`
            are both true.

    Returns:
        `BatchFeature`: Holds the converted images under `pixel_values`.
    """
    do_rescale = do_rescale if do_rescale is not None else self.do_rescale
    # The default is the reciprocal of the forward factor, since this pass goes the other way.
    rescale_factor = 1.0 / self.rescale_factor if rescale_factor is None else rescale_factor
    do_normalize = do_normalize if do_normalize is not None else self.do_normalize
    image_mean = image_mean if image_mean is not None else self.image_mean
    image_std = image_std if image_std is not None else self.image_std
    # Fold the inverse transform into a (mean, std) pair. Assuming `preprocess` computes
    # `(x * rescale_factor - mean) / std` (TODO confirm against the base class), these values
    # yield `rescale_factor * (x * image_std + image_mean)`.
    image_mean = tuple(-rescale_factor * mean / std for mean, std in zip(image_mean, image_std))
    image_std = tuple(1 / std for std in image_std)

    # NOTE(review): `return_tensors` is forwarded unchanged, including the "PIL.Image.Image"
    # sentinel; confirm that `preprocess` accepts that value.
    images = self.preprocess(
        images,
        do_rescale=do_rescale,
        rescale_factor=rescale_factor,
        do_normalize=do_normalize,
        image_mean=image_mean,
        image_std=image_std,
        do_resize=False,
        do_pad=False,
        return_tensors=return_tensors,
    ).pixel_values
    if do_rescale:
        images = [image.clip(0, 255).to(torch.uint8) for image in images]

    if do_normalize and do_rescale and return_tensors == "PIL.Image.Image":
        images = [F.to_pil_image(image) for image in images]

    data = {"pixel_values": images}
    # "PIL.Image.Image" is handled above and is not passed on to `BatchFeature`.
    return_tensors = return_tensors if return_tensors != "PIL.Image.Image" else None

    return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
|
||||
__all__ = ["JanusImageProcessorFast"]
|
||||
@@ -1437,6 +1437,7 @@ class JanusImageProcessor(BlipImageProcessor):
|
||||
self,
|
||||
image: np.ndarray,
|
||||
size: Union[dict[str, int], int],
|
||||
background_color: Optional[tuple[int, int, int]] = None,
|
||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
||||
data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
@@ -1448,6 +1449,10 @@ class JanusImageProcessor(BlipImageProcessor):
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to resize.
|
||||
size (`dict[str, int]` or `int`):
|
||||
The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
|
||||
background_color (`tuple[int, int, int]`):
|
||||
The background color to use for the padding.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
|
||||
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
|
||||
data_format (`ChannelDimension` or `str`, *optional*):
|
||||
@@ -1466,6 +1471,7 @@ class JanusImageProcessor(BlipImageProcessor):
|
||||
Returns:
|
||||
`np.ndarray`: The resized image.
|
||||
"""
|
||||
background_color = background_color if background_color is not None else self.background_color
|
||||
if input_data_format is None:
|
||||
input_data_format = infer_channel_dimension_format(image)
|
||||
|
||||
@@ -1497,7 +1503,7 @@ class JanusImageProcessor(BlipImageProcessor):
|
||||
# Expand and pad the images to obtain a square image of dimensions `size x size`
|
||||
image = self.pad_to_square(
|
||||
image=image,
|
||||
background_color=self.background_color,
|
||||
background_color=background_color,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
return image
|
||||
|
||||
@@ -17,14 +17,21 @@
|
||||
import unittest
|
||||
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_vision_available
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import DeepseekVLImageProcessor
|
||||
|
||||
if is_torchvision_available():
|
||||
from transformers import DeepseekVLImageProcessorFast
|
||||
|
||||
|
||||
# Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester with ViT->DeepseekVL
|
||||
class DeepseekVLImageProcessingTester:
|
||||
@@ -83,10 +90,9 @@ class DeepseekVLImageProcessingTester:
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
# Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTest with ViT->DeepseekVL
|
||||
class DeepseekVLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
# Ignore copy
|
||||
image_processing_class = DeepseekVLImageProcessor if is_vision_available() else None
|
||||
fast_image_processing_class = DeepseekVLImageProcessorFast if is_torchvision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
@@ -113,6 +119,33 @@ class DeepseekVLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
||||
image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
|
||||
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
|
||||
|
||||
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
    """Compare the slow and fast processors image by image on a batch of differently sized inputs."""
    if not (self.test_slow_image_processor and self.test_fast_image_processor):
        self.skipTest(reason="Skipping slow/fast equivalence test")

    slow_class = self.image_processing_class
    fast_class = self.fast_image_processing_class
    if slow_class is None or fast_class is None:
        self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

    if getattr(self.image_processor_tester, "do_center_crop", False):
        self.skipTest(
            reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
        )

    dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
    image_processor_slow = slow_class(**self.image_processor_dict)
    image_processor_fast = fast_class(**self.image_processor_dict)

    encoding_slow = image_processor_slow(dummy_images, return_tensors=None)
    encoding_fast = image_processor_fast(dummy_images, return_tensors=None)

    # Overwrite as the outputs are not always all of the same shape (kept for BC)
    for index, slow_pixel_values in enumerate(encoding_slow.pixel_values):
        self._assert_slow_fast_tensors_equivalence(
            torch.from_numpy(slow_pixel_values), encoding_fast.pixel_values[index]
        )
|
||||
|
||||
# Ignore copy
|
||||
@unittest.skip(reason="Not supported")
|
||||
def test_call_numpy_4_channels(self):
|
||||
|
||||
@@ -13,13 +13,13 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
||||
|
||||
@@ -32,6 +32,9 @@ if is_vision_available():
|
||||
|
||||
from transformers import DeepseekVLHybridImageProcessor
|
||||
|
||||
if is_torchvision_available():
|
||||
from transformers import DeepseekVLHybridImageProcessorFast
|
||||
|
||||
|
||||
class DeepseekVLHybridImageProcessingTester:
|
||||
def __init__(
|
||||
@@ -104,6 +107,7 @@ class DeepseekVLHybridImageProcessingTester:
|
||||
@require_vision
|
||||
class DeepseekVLHybridImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = DeepseekVLHybridImageProcessor if is_vision_available() else None
|
||||
fast_image_processing_class = DeepseekVLHybridImageProcessorFast if is_torchvision_available() else None
|
||||
|
||||
# Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester.setUp with ViT->DeepseekVLHybrid
|
||||
def setUp(self):
|
||||
@@ -213,6 +217,59 @@ class DeepseekVLHybridImageProcessingTest(ImageProcessingTestMixin, unittest.Tes
|
||||
(self.image_processor_tester.batch_size, *expected_output_image_shape),
|
||||
)
|
||||
|
||||
@require_vision
@require_torch
def test_slow_fast_equivalence(self):
    """Compare both outputs of the slow and fast processors on one downloaded image."""
    if not (self.test_slow_image_processor and self.test_fast_image_processor):
        self.skipTest(reason="Skipping slow/fast equivalence test")

    slow_class = self.image_processing_class
    fast_class = self.fast_image_processing_class
    if slow_class is None or fast_class is None:
        self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

    # The image is fetched over the network on every run.
    response = requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True)
    dummy_image = Image.open(response.raw)
    image_processor_slow = slow_class(**self.image_processor_dict)
    image_processor_fast = fast_class(**self.image_processor_dict)

    encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
    encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")

    # Both the low resolution and the high resolution outputs must agree.
    self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
    self._assert_slow_fast_tensors_equivalence(
        encoding_slow.high_res_pixel_values, encoding_fast.high_res_pixel_values
    )
|
||||
|
||||
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
    """Compare slow and fast processors image by image on a mixed-resolution batch.

    Outputs are requested with ``return_tensors=None`` and checked one entry at a
    time for both ``pixel_values`` and ``high_res_pixel_values``.
    """
    if not (self.test_slow_image_processor and self.test_fast_image_processor):
        self.skipTest(reason="Skipping slow/fast equivalence test")

    if None in (self.image_processing_class, self.fast_image_processing_class):
        self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

    if getattr(self.image_processor_tester, "do_center_crop", False):
        self.skipTest(
            reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
        )

    batch = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
    slow_processor = self.image_processing_class(**self.image_processor_dict)
    fast_processor = self.fast_image_processing_class(**self.image_processor_dict)

    slow_encoding = slow_processor(batch, return_tensors=None)
    fast_encoding = fast_processor(batch, return_tensors=None)

    # Entries are compared individually because the outputs are not always all of the
    # same shape (kept for BC). The slow entries are passed through torch.from_numpy
    # before the comparison.
    for output_name in ("pixel_values", "high_res_pixel_values"):
        slow_entries = getattr(slow_encoding, output_name)
        fast_entries = getattr(fast_encoding, output_name)
        for position, slow_entry in enumerate(slow_entries):
            self._assert_slow_fast_tensors_equivalence(torch.from_numpy(slow_entry), fast_entries[position])
|
||||
|
||||
@unittest.skip(reason="Not supported")
def test_call_numpy_4_channels(self):
    """Deliberately skipped: the skip decorator marks this case as "Not supported"."""
|
||||
|
||||
@@ -18,7 +18,7 @@ import unittest
|
||||
import numpy as np
|
||||
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
||||
|
||||
@@ -31,6 +31,9 @@ if is_vision_available():
|
||||
|
||||
from transformers import JanusImageProcessor
|
||||
|
||||
if is_torchvision_available():
|
||||
from transformers import JanusImageProcessorFast
|
||||
|
||||
|
||||
class JanusImageProcessingTester:
|
||||
def __init__(
|
||||
@@ -44,8 +47,8 @@ class JanusImageProcessingTester:
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_normalize=True,
|
||||
image_mean=[1.0, 1.0, 1.0],
|
||||
image_std=[1.0, 1.0, 1.0],
|
||||
image_mean=[0.48145466, 0.4578275, 0.40821073],
|
||||
image_std=[0.26862954, 0.26130258, 0.27577711],
|
||||
do_convert_rgb=True,
|
||||
):
|
||||
size = size if size is not None else {"height": 384, "width": 384}
|
||||
@@ -89,6 +92,7 @@ class JanusImageProcessingTester:
|
||||
@require_vision
|
||||
class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = JanusImageProcessor if is_vision_available() else None
|
||||
fast_image_processing_class = JanusImageProcessorFast if is_torchvision_available() else None
|
||||
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->Janus
|
||||
def setUp(self):
|
||||
@@ -101,7 +105,8 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
return self.image_processor_tester.prepare_image_processor_dict()
|
||||
|
||||
def test_image_processor_properties(self):
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processing = image_processing_class(**self.image_processor_dict)
|
||||
self.assertTrue(hasattr(image_processing, "do_resize"))
|
||||
self.assertTrue(hasattr(image_processing, "size"))
|
||||
self.assertTrue(hasattr(image_processing, "do_normalize"))
|
||||
@@ -110,18 +115,20 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
|
||||
|
||||
def test_image_processor_from_dict_with_kwargs(self):
|
||||
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class.from_dict(self.image_processor_dict)
|
||||
self.assertEqual(image_processor.size, {"height": 384, "width": 384})
|
||||
self.assertEqual(image_processor.image_mean, [1.0, 1.0, 1.0])
|
||||
self.assertEqual(image_processor.image_mean, [0.48145466, 0.4578275, 0.40821073])
|
||||
|
||||
image_processor = self.image_processing_class.from_dict(
|
||||
image_processor = image_processing_class.from_dict(
|
||||
self.image_processor_dict, size=42, image_mean=[1.0, 2.0, 1.0]
|
||||
)
|
||||
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
|
||||
self.assertEqual(image_processor.image_mean, [1.0, 2.0, 1.0])
|
||||
|
||||
def test_call_pil(self):
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processing = image_processing_class(**self.image_processor_dict)
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
|
||||
for image in image_inputs:
|
||||
self.assertIsInstance(image, Image.Image)
|
||||
@@ -137,7 +144,8 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
|
||||
|
||||
def test_call_numpy(self):
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processing = image_processing_class(**self.image_processor_dict)
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
|
||||
for image in image_inputs:
|
||||
self.assertIsInstance(image, np.ndarray)
|
||||
@@ -151,7 +159,8 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
|
||||
|
||||
def test_call_pytorch(self):
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processing = image_processing_class(**self.image_processor_dict)
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
|
||||
|
||||
for image in image_inputs:
|
||||
@@ -166,7 +175,8 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
|
||||
|
||||
def test_nested_input(self):
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processing = image_processing_class(**self.image_processor_dict)
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
|
||||
|
||||
# Test batched as a list of images.
|
||||
@@ -183,6 +193,50 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
# Image processor should return same pixel values, independently of input format.
|
||||
self.assertTrue((encoded_images_nested == encoded_images).all())
|
||||
|
||||
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
    """Compare slow and fast processors image by image on a mixed-resolution batch.

    Outputs are requested with ``return_tensors=None`` and each ``pixel_values``
    entry is checked on its own.
    """
    if not (self.test_slow_image_processor and self.test_fast_image_processor):
        self.skipTest(reason="Skipping slow/fast equivalence test")

    if None in (self.image_processing_class, self.fast_image_processing_class):
        self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

    if getattr(self.image_processor_tester, "do_center_crop", False):
        self.skipTest(
            reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
        )

    batch = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
    slow_processor = self.image_processing_class(**self.image_processor_dict)
    fast_processor = self.fast_image_processing_class(**self.image_processor_dict)

    slow_encoding = slow_processor(batch, return_tensors=None)
    fast_encoding = fast_processor(batch, return_tensors=None)

    # Entries are compared individually because the outputs are not always all of the
    # same shape (kept for BC). The slow entries are passed through torch.from_numpy
    # before the comparison.
    for position, slow_entry in enumerate(slow_encoding.pixel_values):
        self._assert_slow_fast_tensors_equivalence(
            torch.from_numpy(slow_entry), fast_encoding.pixel_values[position]
        )
|
||||
|
||||
@require_vision
@require_torch
def test_slow_fast_equivalence_postprocess(self):
    """Check that ``postprocess`` gives equivalent results for the slow and fast processors.

    The prepared images are divided by 255.0 before being handed to ``postprocess``.
    Each resulting ``pixel_values`` entry is cast to float on both sides and compared
    individually. The test is skipped when either processor flavour is disabled on the
    test case or is not defined.
    """
    # Same guards as the other slow/fast equivalence tests: without them a missing
    # processor class (None) would fail with a TypeError instead of a clean skip.
    if not self.test_slow_image_processor or not self.test_fast_image_processor:
        self.skipTest(reason="Skipping slow/fast equivalence test")

    if self.image_processing_class is None or self.fast_image_processing_class is None:
        self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

    dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
    # NOTE(review): presumably rescales 0-255 inputs to the range postprocess expects; confirm.
    dummy_images = [image / 255.0 for image in dummy_images]
    image_processor_slow = self.image_processing_class(**self.image_processor_dict)
    image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)

    encoding_slow = image_processor_slow.postprocess(dummy_images, return_tensors=None)
    encoding_fast = image_processor_fast.postprocess(dummy_images, return_tensors=None)

    # Entries are compared individually because the outputs are not always all of the
    # same shape (kept for BC).
    for index, slow_pixels in enumerate(encoding_slow.pixel_values):
        self._assert_slow_fast_tensors_equivalence(
            torch.from_numpy(slow_pixels).float(), encoding_fast.pixel_values[index].float()
        )
|
||||
|
||||
@unittest.skip(reason="Not supported")
def test_call_numpy_4_channels(self):
    """Deliberately skipped: the skip decorator marks this case as "Not supported"."""
|
||||
|
||||
Reference in New Issue
Block a user