Add fast image processor Janus, Deepseek VL, Deepseek VL hybrid (#39739)

* add fast image processor Janus, deepseek_vl, deepseek_vl_hybrid

* fix after review
This commit is contained in:
Yoni Gozlan
2025-08-01 12:20:08 -04:00
committed by GitHub
parent 88ead3f518
commit 7b4d9843ba
19 changed files with 1268 additions and 97 deletions

View File

@@ -209,6 +209,10 @@ model = DeepseekVLForConditionalGeneration.from_pretrained(
[[autodoc]] DeepseekVLImageProcessor [[autodoc]] DeepseekVLImageProcessor
## DeepseekVLImageProcessorFast
[[autodoc]] DeepseekVLImageProcessorFast
## DeepseekVLModel ## DeepseekVLModel
[[autodoc]] DeepseekVLModel [[autodoc]] DeepseekVLModel

View File

@@ -208,6 +208,10 @@ model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
[[autodoc]] DeepseekVLHybridImageProcessor [[autodoc]] DeepseekVLHybridImageProcessor
## DeepseekVLHybridImageProcessorFast
[[autodoc]] DeepseekVLHybridImageProcessorFast
## DeepseekVLHybridModel ## DeepseekVLHybridModel
[[autodoc]] DeepseekVLHybridModel [[autodoc]] DeepseekVLHybridModel

View File

@@ -209,6 +209,10 @@ for i, image in enumerate(images['pixel_values']):
[[autodoc]] JanusImageProcessor [[autodoc]] JanusImageProcessor
## JanusImageProcessorFast
[[autodoc]] JanusImageProcessorFast
## JanusVisionModel ## JanusVisionModel
[[autodoc]] JanusVisionModel [[autodoc]] JanusVisionModel

View File

@@ -78,8 +78,8 @@ else:
("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
("cvt", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("cvt", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
("data2vec-vision", ("BeitImageProcessor", "BeitImageProcessorFast")), ("data2vec-vision", ("BeitImageProcessor", "BeitImageProcessorFast")),
("deepseek_vl", ("DeepseekVLImageProcessor")), ("deepseek_vl", ("DeepseekVLImageProcessor", "DeepseekVLImageProcessorFast")),
("deepseek_vl_hybrid", ("DeepseekVLHybridImageProcessor")), ("deepseek_vl_hybrid", ("DeepseekVLHybridImageProcessor", "DeepseekVLHybridImageProcessorFast")),
("deformable_detr", ("DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast")), ("deformable_detr", ("DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast")),
("deit", ("DeiTImageProcessor", "DeiTImageProcessorFast")), ("deit", ("DeiTImageProcessor", "DeiTImageProcessorFast")),
("depth_anything", ("DPTImageProcessor", "DPTImageProcessorFast")), ("depth_anything", ("DPTImageProcessor", "DPTImageProcessorFast")),
@@ -113,7 +113,7 @@ else:
("imagegpt", ("ImageGPTImageProcessor",)), ("imagegpt", ("ImageGPTImageProcessor",)),
("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")), ("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")),
("instructblipvideo", ("InstructBlipVideoImageProcessor",)), ("instructblipvideo", ("InstructBlipVideoImageProcessor",)),
("janus", ("JanusImageProcessor")), ("janus", ("JanusImageProcessor", "JanusImageProcessorFast")),
("kosmos-2", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("kosmos-2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")), ("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")),
("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")), ("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),

View File

@@ -20,7 +20,9 @@
from ...configuration_utils import PretrainedConfig from ...configuration_utils import PretrainedConfig
from ...utils import logging from ...utils import (
logging,
)
from ..auto import CONFIG_MAPPING, AutoConfig from ..auto import CONFIG_MAPPING, AutoConfig

View File

@@ -131,6 +131,7 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
self, self,
image: np.ndarray, image: np.ndarray,
size: Union[dict[str, int], int], size: Union[dict[str, int], int],
background_color: Optional[tuple[int, int, int]] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC, resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None, data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -142,6 +143,10 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
Args: Args:
image (`np.ndarray`): image (`np.ndarray`):
Image to resize. Image to resize.
size (`dict[str, int]` or `int`):
The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
background_color (`tuple[int, int, int]`):
The background color to use for the padding.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
data_format (`ChannelDimension` or `str`, *optional*): data_format (`ChannelDimension` or `str`, *optional*):
@@ -160,6 +165,7 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
Returns: Returns:
`np.ndarray`: The resized image. `np.ndarray`: The resized image.
""" """
background_color = background_color if background_color is not None else self.background_color
if input_data_format is None: if input_data_format is None:
input_data_format = infer_channel_dimension_format(image) input_data_format = infer_channel_dimension_format(image)
@@ -191,7 +197,7 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
# Expand and pad the images to obtain a square image of dimensions `size x size` # Expand and pad the images to obtain a square image of dimensions `size x size`
image = self.pad_to_square( image = self.pad_to_square(
image=image, image=image,
background_color=self.background_color, background_color=background_color,
input_data_format=input_data_format, input_data_format=input_data_format,
) )
return image return image
@@ -406,9 +412,5 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
return result return result
def postprocess(self):
"""Applies post-processing to the decoded image tokens by reversing transformations applied during preprocessing."""
raise AttributeError("Not needed for DeepseekVL")
__all__ = ["DeepseekVLImageProcessor"] __all__ = ["DeepseekVLImageProcessor"]

View File

@@ -0,0 +1,199 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_deepseek_vl.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Union
import torch.nn.functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict
from ...processing_utils import Unpack
from ...utils import (
TensorType,
auto_docstring,
is_torch_available,
)
if is_torch_available():
import torch
class DeepseekVLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    r"""
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width
        falls below this value after resizing.
    """

    # Extra keyword accepted on top of the default fast-processor kwargs; it is the
    # per-side floor that `DeepseekVLImageProcessorFast.resize` applies with `max(...)`.
    min_size: int
@auto_docstring
class DeepseekVLImageProcessorFast(BaseImageProcessorFast):
    # Torch-based image processor: aspect-ratio-preserving resize, optional padding to a
    # square with a background colour, then fused rescale + normalize.
    resample = PILImageResampling.BICUBIC
    image_mean = OPENAI_CLIP_MEAN
    image_std = OPENAI_CLIP_STD
    size = {"height": 384, "width": 384}
    min_size = 14
    do_resize = True
    do_rescale = True
    do_normalize = True
    valid_kwargs = DeepseekVLFastImageProcessorKwargs

    def __init__(self, **kwargs: Unpack[DeepseekVLFastImageProcessorKwargs]):
        super().__init__(**kwargs)
        # Only an explicitly passed `image_mean` drives the padding colour (scaled to 0-255);
        # when the caller does not pass one, mid-grey is used.
        image_mean = kwargs.get("image_mean")
        if image_mean is None:
            self.background_color = (127, 127, 127)
        else:
            self.background_color = tuple(int(x * 255) for x in image_mean)

    def resize(
        self,
        image: "torch.Tensor",
        size: SizeDict,
        min_size: int,
        interpolation: "F.InterpolationMode" = None,
        antialias: bool = True,
        **kwargs,
    ) -> "torch.Tensor":
        """
        Resize so that the longest side equals the requested (square) size, keeping the aspect ratio.

        Args:
            image (`torch.Tensor`):
                Image(s) to resize; height and width are read from the last two dimensions.
            size (`SizeDict`):
                Target size. `height` and `width` must both be set and equal.
            min_size (`int`):
                Lower bound applied to each output side after scaling.
            interpolation (`F.InterpolationMode`, *optional*):
                Interpolation mode forwarded to the parent `resize`.
            antialias (`bool`, *optional*, defaults to `True`):
                Forwarded to the parent `resize`.

        Returns:
            `torch.Tensor`: The resized (not yet padded) image(s).

        Raises:
            ValueError: If `size` is not a fully specified square.
        """
        if size.height is None or size.width is None or size.height != size.width:
            raise ValueError(
                f"Output height and width must be the same. Got height={size.height} and width={size.width}"
            )
        target = size.height

        height, width = image.shape[-2:]
        scale = target / max(height, width)
        # Largest side becomes `target` and the other side is scaled according to the aspect ratio.
        output_size_nonpadded = SizeDict(
            height=max(int(height * scale), min_size),
            width=max(int(width * scale), min_size),
        )

        return super().resize(image, size=output_size_nonpadded, interpolation=interpolation, antialias=antialias)

    def pad_to_square(
        self,
        images: "torch.Tensor",
        background_color: Union[int, tuple[int, int, int]] = 0,
    ) -> "torch.Tensor":
        """
        Pads a batch of images to a square based on the longest edge, centering the original content.

        Args:
            images (`torch.Tensor`):
                The images to pad, indexed as `(batch, channels, height, width)`.
            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
                The color to use for the padding. Can be an integer for single channel or a
                tuple of integers for multi-channel images. If passed as an integer
                in multi-channel mode, it will default to `0` in subsequent channels.

        Returns:
            `torch.Tensor`: The padded images (the input itself when it is already square).

        Raises:
            ValueError: If a non-integer `background_color` does not have one entry per channel.
        """
        height, width = images.shape[-2:]
        if height == width:
            return images

        batch_size, num_channels = images.shape[0], images.shape[1]
        max_dim = max(height, width)

        # Ensure background_color is the correct shape
        if isinstance(background_color, int):
            background_color = [background_color]
        elif len(background_color) != num_channels:
            # The check is an exact-length match, so say so (too-short inputs are rejected too).
            raise ValueError(
                f"background_color must have exactly {num_channels} elements to match the number of channels"
            )

        padded_images = torch.zeros(
            (batch_size, num_channels, max_dim, max_dim), dtype=images.dtype, device=images.device
        )
        # Channels beyond len(background_color) keep the zero fill.
        for channel, color in enumerate(background_color):
            padded_images[:, channel, :, :] = color

        if width > height:
            start = (max_dim - height) // 2
            padded_images[:, :, start : start + height, :] = images
        else:
            start = (max_dim - width) // 2
            padded_images[:, :, :, start : start + width] = images

        return padded_images

    def _preprocess(
        self,
        images: list["torch.Tensor"],
        do_resize: bool,
        size: SizeDict,
        min_size: int,
        interpolation: Optional["F.InterpolationMode"],
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Optional[Union[float, list[float]]],
        image_std: Optional[Union[float, list[float]]],
        disable_grouping: Optional[bool],
        return_tensors: Optional[Union[str, TensorType]],
        do_pad: bool = True,
        **kwargs,
    ) -> BatchFeature:
        """
        Run resize -> pad-to-square -> rescale/normalize over `images`, batching same-shape images.

        Returns:
            `BatchFeature`: Holds `pixel_values`; the images are stacked into one tensor when
            `return_tensors` is truthy, otherwise left as a list.
        """
        # Group images by size for batched resizing
        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
        resized_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            if do_resize:
                stacked_images = self.resize(
                    image=stacked_images, size=size, min_size=min_size, interpolation=interpolation
                )
            resized_images_grouped[shape] = stacked_images
        resized_images = reorder_images(resized_images_grouped, grouped_images_index)

        # Group images by size for further processing
        # Needed in case do_resize is False, or resize returns images with different sizes
        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
        processed_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            if do_pad:
                stacked_images = self.pad_to_square(stacked_images, background_color=self.background_color)
            # Fused rescale and normalize
            stacked_images = self.rescale_and_normalize(
                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
            )
            processed_images_grouped[shape] = stacked_images

        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
# Public API of this module: only the fast image processor class is exported.
__all__ = ["DeepseekVLImageProcessorFast"]

View File

@@ -33,6 +33,7 @@ from ...utils import (
from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel
from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCausalLMOutputWithPast from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCausalLMOutputWithPast
from ..janus.image_processing_janus import JanusImageProcessor from ..janus.image_processing_janus import JanusImageProcessor
from ..janus.image_processing_janus_fast import JanusImageProcessorFast
from ..janus.modeling_janus import JanusForConditionalGeneration, JanusModel, JanusPreTrainedModel from ..janus.modeling_janus import JanusForConditionalGeneration, JanusModel, JanusPreTrainedModel
@@ -181,6 +182,9 @@ class DeepseekVLForConditionalGeneration(JanusForConditionalGeneration):
class DeepseekVLImageProcessor(JanusImageProcessor): class DeepseekVLImageProcessor(JanusImageProcessor):
def __init__(self, **super_kwargs):
super().__init__(**super_kwargs)
def postprocess(self): def postprocess(self):
raise AttributeError("Not needed for DeepseekVL") raise AttributeError("Not needed for DeepseekVL")
@@ -188,6 +192,14 @@ class DeepseekVLImageProcessor(JanusImageProcessor):
raise AttributeError("Not needed for DeepseekVL") raise AttributeError("Not needed for DeepseekVL")
class DeepseekVLImageProcessorFast(JanusImageProcessorFast):
def __init__(self, **super_kwargs):
super().__init__(**super_kwargs)
def postprocess(self):
raise AttributeError("Not needed for DeepseekVL")
class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False): class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False):
_defaults = { _defaults = {
"text_kwargs": {"padding": False}, "text_kwargs": {"padding": False},
@@ -322,5 +334,6 @@ __all__ = [
"DeepseekVLModel", "DeepseekVLModel",
"DeepseekVLForConditionalGeneration", "DeepseekVLForConditionalGeneration",
"DeepseekVLImageProcessor", "DeepseekVLImageProcessor",
"DeepseekVLImageProcessorFast",
"DeepseekVLProcessor", "DeepseekVLProcessor",
] ]

View File

@@ -21,6 +21,7 @@ if TYPE_CHECKING:
from .configuration_deepseek_vl_hybrid import * from .configuration_deepseek_vl_hybrid import *
from .image_processing_deepseek_vl_fast_hybrid import * from .image_processing_deepseek_vl_fast_hybrid import *
from .image_processing_deepseek_vl_hybrid import * from .image_processing_deepseek_vl_hybrid import *
from .image_processing_deepseek_vl_hybrid_fast import *
from .modeling_deepseek_vl_hybrid import * from .modeling_deepseek_vl_hybrid import *
from .processing_deepseek_vl_hybrid import * from .processing_deepseek_vl_hybrid import *
else: else:

View File

@@ -154,14 +154,15 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
self.background_color = tuple([int(x * 255) for x in image_mean]) self.background_color = tuple([int(x * 255) for x in image_mean])
if high_res_image_mean is None: if high_res_image_mean is None:
self.background_color = (127, 127, 127) self.high_res_background_color = (127, 127, 127)
else: else:
self.background_color = tuple([int(x * 255) for x in high_res_image_mean]) self.high_res_background_color = tuple([int(x * 255) for x in high_res_image_mean])
def resize( def resize(
self, self,
image: np.ndarray, image: np.ndarray,
size: Union[dict[str, int], int], size: Union[dict[str, int], int],
background_color: Optional[tuple[int, int, int]] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC, resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None, data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -173,6 +174,10 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
Args: Args:
image (`np.ndarray`): image (`np.ndarray`):
Image to resize. Image to resize.
size (`dict[str, int]` or `int`):
The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
background_color (`tuple[int, int, int]`):
The background color to use for the padding.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
data_format (`ChannelDimension` or `str`, *optional*): data_format (`ChannelDimension` or `str`, *optional*):
@@ -191,6 +196,7 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
Returns: Returns:
`np.ndarray`: The resized image. `np.ndarray`: The resized image.
""" """
background_color = background_color if background_color is not None else self.background_color
if input_data_format is None: if input_data_format is None:
input_data_format = infer_channel_dimension_format(image) input_data_format = infer_channel_dimension_format(image)
@@ -222,7 +228,7 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
# Expand and pad the images to obtain a square image of dimensions `size x size` # Expand and pad the images to obtain a square image of dimensions `size x size`
image = self.pad_to_square( image = self.pad_to_square(
image=image, image=image,
background_color=self.background_color, background_color=background_color,
input_data_format=input_data_format, input_data_format=input_data_format,
) )
return image return image
@@ -361,16 +367,20 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
# high_res_image: resize (high) -> rescale -> normalize (high) # high_res_image: resize (high) -> rescale -> normalize (high)
# low_res_image: resize (high) -> rescale -> resize (low) -> normalize (low) # low_res_image: resize (high) -> rescale -> resize (low) -> normalize (low)
high_res_image = image high_res_image = image
if do_resize: if do_resize:
high_res_image = self.resize( high_res_image = self.resize(
image=high_res_image, image=high_res_image,
size=high_res_size_dict, size=high_res_size_dict,
background_color=self.high_res_background_color,
resample=high_res_resample, resample=high_res_resample,
input_data_format=input_data_format, input_data_format=input_data_format,
) )
image = self.resize( image = self.resize(
image=high_res_image, size=size_dict, resample=resample, input_data_format=input_data_format image=high_res_image,
size=size_dict,
background_color=self.background_color,
resample=resample,
input_data_format=input_data_format,
) )
if do_rescale: if do_rescale:
@@ -475,9 +485,5 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
return result return result
def postprocess(self):
"""Applies post-processing to the decoded image tokens by reversing transformations applied during preprocessing."""
raise AttributeError("Not needed for DeepseekVLHybrid")
__all__ = ["DeepseekVLHybridImageProcessor"] __all__ = ["DeepseekVLHybridImageProcessor"]

View File

@@ -0,0 +1,326 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_deepseek_vl_hybrid.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Union
import torch
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
BatchFeature,
DefaultFastImageProcessorKwargs,
get_size_dict,
group_images_by_shape,
reorder_images,
)
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ChannelDimension, PILImageResampling, SizeDict
from ...processing_utils import Unpack
from ...utils import (
TensorType,
auto_docstring,
is_torchvision_available,
is_torchvision_v2_available,
)
if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
from ...image_utils import pil_torch_interpolation_mapping
elif is_torchvision_available():
from torchvision.transforms import functional as F
from ...image_utils import pil_torch_interpolation_mapping
class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    r"""
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width
        falls below this value after resizing.
    high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
        Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
        method.
    high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
        Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
        overridden by the `high_res_resample` parameter in the `preprocess` method.
    high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
        Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
        channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
    high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
        Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
        number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
    """

    # Per-side floor applied by the processor's `resize` (used for both resolutions).
    min_size: int
    # The `high_res_*` fields mirror the regular size/resample/mean/std options but
    # configure the separate high-resolution output branch.
    high_res_size: dict
    high_res_resample: "PILImageResampling"
    high_res_image_mean: list[float]
    high_res_image_std: list[float]
@auto_docstring
class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast):
    # Torch-based image processor producing two outputs per image: a high-resolution
    # tensor and a low-resolution tensor derived from the padded high-resolution one.
    resample = PILImageResampling.BICUBIC
    image_mean = OPENAI_CLIP_MEAN
    image_std = OPENAI_CLIP_STD
    size = {"height": 384, "width": 384}
    min_size = 14
    do_resize = True
    do_rescale = True
    do_normalize = True
    valid_kwargs = DeepseekVLHybridFastImageProcessorKwargs
    high_res_image_mean = OPENAI_CLIP_MEAN
    high_res_image_std = OPENAI_CLIP_STD
    high_res_size = {"height": 1024, "width": 1024}
    high_res_resample = PILImageResampling.BICUBIC

    def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]):
        # Padding colours follow explicitly passed means (scaled to 0-255); when a mean is
        # not passed by the caller, mid-grey is used for that branch.
        image_mean = kwargs.get("image_mean")
        if image_mean is None:
            background_color = (127, 127, 127)
        else:
            background_color = tuple(int(x * 255) for x in image_mean)

        high_res_image_mean = kwargs.get("high_res_image_mean")
        if high_res_image_mean is None:
            high_res_background_color = (127, 127, 127)
        else:
            high_res_background_color = tuple(int(x * 255) for x in high_res_image_mean)

        super().__init__(**kwargs)
        self.background_color = background_color
        self.high_res_background_color = high_res_background_color

    def resize(
        self,
        image: "torch.Tensor",
        size: SizeDict,
        min_size: int,
        interpolation: "F.InterpolationMode" = None,
        antialias: bool = True,
        **kwargs,
    ) -> "torch.Tensor":
        """
        Resize so that the longest side equals the requested (square) size, keeping the aspect ratio.

        Args:
            image (`torch.Tensor`):
                Image(s) to resize; height and width are read from the last two dimensions.
            size (`SizeDict`):
                Target size. `height` and `width` must both be set and equal.
            min_size (`int`):
                Lower bound applied to each output side after scaling.
            interpolation (`F.InterpolationMode`, *optional*):
                Interpolation mode forwarded to the parent `resize`.
            antialias (`bool`, *optional*, defaults to `True`):
                Forwarded to the parent `resize`.

        Returns:
            `torch.Tensor`: The resized (not yet padded) image(s).

        Raises:
            ValueError: If `size` is not a fully specified square.
        """
        if size.height is None or size.width is None or size.height != size.width:
            raise ValueError(
                f"Output height and width must be the same. Got height={size.height} and width={size.width}"
            )
        target = size.height

        height, width = image.shape[-2:]
        scale = target / max(height, width)
        # Largest side becomes `target` and the other side is scaled according to the aspect ratio.
        output_size_nonpadded = SizeDict(
            height=max(int(height * scale), min_size),
            width=max(int(width * scale), min_size),
        )

        return super().resize(image, size=output_size_nonpadded, interpolation=interpolation, antialias=antialias)

    def pad_to_square(
        self,
        images: "torch.Tensor",
        background_color: Union[int, tuple[int, int, int]] = 0,
    ) -> "torch.Tensor":
        """
        Pads a batch of images to a square based on the longest edge, centering the original content.

        Args:
            images (`torch.Tensor`):
                The images to pad, indexed as `(batch, channels, height, width)`.
            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
                The color to use for the padding. Can be an integer for single channel or a
                tuple of integers for multi-channel images. If passed as an integer
                in multi-channel mode, it will default to `0` in subsequent channels.

        Returns:
            `torch.Tensor`: The padded images (the input itself when it is already square).

        Raises:
            ValueError: If a non-integer `background_color` does not have one entry per channel.
        """
        height, width = images.shape[-2:]
        if height == width:
            return images

        batch_size, num_channels = images.shape[0], images.shape[1]
        max_dim = max(height, width)

        # Ensure background_color is the correct shape
        if isinstance(background_color, int):
            background_color = [background_color]
        elif len(background_color) != num_channels:
            # The check is an exact-length match, so say so (too-short inputs are rejected too).
            raise ValueError(
                f"background_color must have exactly {num_channels} elements to match the number of channels"
            )

        padded_images = torch.zeros(
            (batch_size, num_channels, max_dim, max_dim), dtype=images.dtype, device=images.device
        )
        # Channels beyond len(background_color) keep the zero fill.
        for channel, color in enumerate(background_color):
            padded_images[:, channel, :, :] = color

        if width > height:
            start = (max_dim - height) // 2
            padded_images[:, :, start : start + height, :] = images
        else:
            start = (max_dim - width) // 2
            padded_images[:, :, :, start : start + width] = images

        return padded_images

    def _preprocess(
        self,
        images: list["torch.Tensor"],
        do_resize: bool,
        size: SizeDict,
        high_res_size: SizeDict,
        min_size: int,
        interpolation: Optional["F.InterpolationMode"],
        high_res_interpolation: Optional["F.InterpolationMode"],
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Optional[Union[float, list[float]]],
        image_std: Optional[Union[float, list[float]]],
        high_res_image_mean: Optional[Union[float, list[float]]],
        high_res_image_std: Optional[Union[float, list[float]]],
        disable_grouping: Optional[bool],
        return_tensors: Optional[Union[str, TensorType]],
        do_pad: bool = True,
        **kwargs,
    ) -> BatchFeature:
        """
        Produce the high-resolution and low-resolution tensors for `images`.

        High-res branch: resize (high) -> pad -> rescale/normalize (high statistics).
        Low-res branch: starts from the padded high-res images, then
        resize (low) -> pad -> rescale/normalize (low statistics).

        When `do_resize` is `False` both resize steps are skipped and the images are passed
        through unchanged to the next stage.

        Returns:
            `BatchFeature`: Holds `pixel_values` and `high_res_pixel_values`; each is stacked
            into one tensor when `return_tensors` is truthy, otherwise left as a list.
        """
        # Group images by size for batched resizing
        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
        high_res_resized_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            # Default to the unresized batch so the name is always bound when do_resize is False.
            stacked_high_res_images = stacked_images
            if do_resize:
                stacked_high_res_images = self.resize(
                    image=stacked_images, size=high_res_size, min_size=min_size, interpolation=high_res_interpolation
                )
            high_res_resized_images_grouped[shape] = stacked_high_res_images
        high_res_resized_images = reorder_images(high_res_resized_images_grouped, grouped_images_index)

        # Group images by size for further processing
        # Needed in case do_resize is False, or resize returns images with different sizes
        grouped_high_res_images, grouped_high_res_images_index = group_images_by_shape(
            high_res_resized_images, disable_grouping=disable_grouping
        )
        high_res_padded_images = {}
        high_res_processed_images_grouped = {}
        for shape, stacked_high_res_images in grouped_high_res_images.items():
            if do_pad:
                stacked_high_res_images = self.pad_to_square(
                    stacked_high_res_images, background_color=self.high_res_background_color
                )
            # Keep the padded-but-unnormalized batch: it is the input of the low-res branch.
            high_res_padded_images[shape] = stacked_high_res_images
            # Fused rescale and normalize
            stacked_high_res_images = self.rescale_and_normalize(
                stacked_high_res_images,
                do_rescale,
                rescale_factor,
                do_normalize,
                high_res_image_mean,
                high_res_image_std,
            )
            high_res_processed_images_grouped[shape] = stacked_high_res_images
        high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index)
        high_res_processed_images = (
            torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images
        )

        resized_images_grouped = {}
        for shape, stacked_high_res_padded_images in high_res_padded_images.items():
            # Default to the padded high-res batch; previously a stale variable from the
            # first loop was stored here when do_resize was False.
            stacked_images = stacked_high_res_padded_images
            if do_resize:
                stacked_images = self.resize(
                    image=stacked_high_res_padded_images, size=size, min_size=min_size, interpolation=interpolation
                )
            resized_images_grouped[shape] = stacked_images
        resized_images = reorder_images(resized_images_grouped, grouped_high_res_images_index)

        grouped_resized_images, grouped_resized_images_index = group_images_by_shape(
            resized_images, disable_grouping=disable_grouping
        )
        processed_images_grouped = {}
        for shape, stacked_images in grouped_resized_images.items():
            if do_pad:
                stacked_images = self.pad_to_square(stacked_images, background_color=self.background_color)
            # Fused rescale and normalize
            stacked_images = self.rescale_and_normalize(
                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
            )
            processed_images_grouped[shape] = stacked_images
        processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index)
        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

        return BatchFeature(
            data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images},
            tensor_type=return_tensors,
        )

    def _further_process_kwargs(
        self,
        size: Optional[SizeDict] = None,
        high_res_size: Optional[SizeDict] = None,
        default_to_square: Optional[bool] = None,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        high_res_image_mean: Optional[Union[float, list[float]]] = None,
        high_res_image_std: Optional[Union[float, list[float]]] = None,
        data_format: Optional[ChannelDimension] = None,
        **kwargs,
    ) -> dict:
        """
        Update kwargs that need further processing before being validated
        Can be overridden by subclasses to customize the processing of kwargs.

        Sizes are converted to `SizeDict`, list-valued means/stds to tuples, `data_format`
        defaults to channels-first, and `high_res_resample` is replaced by a
        `high_res_interpolation` entry.

        Returns:
            `dict`: The remaining `kwargs` updated with the processed values.
        """
        if size is not None:
            size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square))
        if high_res_size is not None:
            high_res_size = SizeDict(**get_size_dict(size=high_res_size, default_to_square=default_to_square))
        if isinstance(image_mean, list):
            image_mean = tuple(image_mean)
        if isinstance(image_std, list):
            image_std = tuple(image_std)
        if isinstance(high_res_image_mean, list):
            high_res_image_mean = tuple(high_res_image_mean)
        if isinstance(high_res_image_std, list):
            high_res_image_std = tuple(high_res_image_std)
        if data_format is None:
            data_format = ChannelDimension.FIRST

        # Fall back to the processor's configured filter instead of raising a bare KeyError
        # when the caller did not supply `high_res_resample`.
        high_res_resample = kwargs.pop("high_res_resample", self.high_res_resample)
        # PIL resampling enums/ints are mapped to their torch counterpart; anything else is passed through.
        kwargs["high_res_interpolation"] = (
            pil_torch_interpolation_mapping[high_res_resample]
            if isinstance(high_res_resample, (int, PILImageResampling))
            else high_res_resample
        )

        kwargs["size"] = size
        kwargs["high_res_size"] = high_res_size
        kwargs["default_to_square"] = default_to_square
        kwargs["image_mean"] = image_mean
        kwargs["image_std"] = image_std
        kwargs["high_res_image_mean"] = high_res_image_mean
        kwargs["high_res_image_std"] = high_res_image_std
        kwargs["data_format"] = data_format

        return kwargs
# Public API of this module: only the fast hybrid image processor class is exported.
__all__ = ["DeepseekVLHybridImageProcessorFast"]

View File

@@ -20,7 +20,10 @@ import torch.nn as nn
from ...cache_utils import Cache from ...cache_utils import Cache
from ...image_processing_utils_fast import ( from ...image_processing_utils_fast import (
BatchFeature, BatchFeature,
DefaultFastImageProcessorKwargs,
get_size_dict, get_size_dict,
group_images_by_shape,
reorder_images,
) )
from ...image_transforms import convert_to_rgb, to_channel_dimension_format from ...image_transforms import convert_to_rgb, to_channel_dimension_format
from ...image_utils import ( from ...image_utils import (
@@ -29,6 +32,7 @@ from ...image_utils import (
ChannelDimension, ChannelDimension,
ImageInput, ImageInput,
PILImageResampling, PILImageResampling,
SizeDict,
infer_channel_dimension_format, infer_channel_dimension_format,
is_scaled_image, is_scaled_image,
make_flat_list_of_images, make_flat_list_of_images,
@@ -48,11 +52,14 @@ from ...utils import (
auto_docstring, auto_docstring,
can_return_tuple, can_return_tuple,
filter_out_non_signature_kwargs, filter_out_non_signature_kwargs,
is_torchvision_available,
is_torchvision_v2_available,
logging, logging,
) )
from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel
from ..deepseek_vl.configuration_deepseek_vl import DeepseekVLConfig from ..deepseek_vl.configuration_deepseek_vl import DeepseekVLConfig
from ..deepseek_vl.image_processing_deepseek_vl import DeepseekVLImageProcessor from ..deepseek_vl.image_processing_deepseek_vl import DeepseekVLImageProcessor
from ..deepseek_vl.image_processing_deepseek_vl_fast import DeepseekVLImageProcessorFast
from ..deepseek_vl.modeling_deepseek_vl import ( from ..deepseek_vl.modeling_deepseek_vl import (
DeepseekVLForConditionalGeneration, DeepseekVLForConditionalGeneration,
DeepseekVLModel, DeepseekVLModel,
@@ -63,6 +70,16 @@ from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCa
from ..sam.modeling_sam import SamLayerNorm, SamVisionNeck from ..sam.modeling_sam import SamLayerNorm, SamVisionNeck
if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
from ...image_utils import pil_torch_interpolation_mapping
elif is_torchvision_available():
from torchvision.transforms import functional as F
from ...image_utils import pil_torch_interpolation_mapping
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
@@ -516,9 +533,9 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
) )
if high_res_image_mean is None: if high_res_image_mean is None:
self.background_color = (127, 127, 127) self.high_res_background_color = (127, 127, 127)
else: else:
self.background_color = tuple([int(x * 255) for x in high_res_image_mean]) self.high_res_background_color = tuple([int(x * 255) for x in high_res_image_mean])
@filter_out_non_signature_kwargs() @filter_out_non_signature_kwargs()
def preprocess( def preprocess(
@@ -654,16 +671,20 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
# high_res_image: resize (high) -> rescale -> normalize (high) # high_res_image: resize (high) -> rescale -> normalize (high)
# low_res_image: resize (high) -> rescale -> resize (low) -> normalize (low) # low_res_image: resize (high) -> rescale -> resize (low) -> normalize (low)
high_res_image = image high_res_image = image
if do_resize: if do_resize:
high_res_image = self.resize( high_res_image = self.resize(
image=high_res_image, image=high_res_image,
size=high_res_size_dict, size=high_res_size_dict,
background_color=self.high_res_background_color,
resample=high_res_resample, resample=high_res_resample,
input_data_format=input_data_format, input_data_format=input_data_format,
) )
image = self.resize( image = self.resize(
image=high_res_image, size=size_dict, resample=resample, input_data_format=input_data_format image=high_res_image,
size=size_dict,
background_color=self.background_color,
resample=resample,
input_data_format=input_data_format,
) )
if do_rescale: if do_rescale:
@@ -695,6 +716,192 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
return BatchFeature(data=data, tensor_type=return_tensors) return BatchFeature(data=data, tensor_type=return_tensors)
class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    r"""
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width
        falls below this value after resizing.
    high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
        Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
        method.
    high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
        Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
        overridden by the `high_res_resample` parameter in the `preprocess` method.
    high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
        Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
        channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
    high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
        Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
        number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
    """

    # Keyword arguments declared on top of those of `DefaultFastImageProcessorKwargs`.
    # Each field is described in the docstring above; the `high_res_*` fields are the
    # high resolution counterparts of the low resolution size/resample/mean/std settings.
    min_size: int
    high_res_size: dict
    high_res_resample: "PILImageResampling"
    high_res_image_mean: list[float]
    high_res_image_std: list[float]
class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
    # Defaults for the high resolution branch; the low resolution defaults come from the parent class.
    high_res_image_mean = OPENAI_CLIP_MEAN
    high_res_image_std = OPENAI_CLIP_STD
    high_res_size = {"height": 1024, "width": 1024}
    high_res_resample = PILImageResampling.BICUBIC

    def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]):
        # Padding colors are derived from the normalization means (scaled to 0-255), or mid-grey
        # when no mean is supplied.
        image_mean = kwargs.get("image_mean", None)
        if image_mean is None:
            background_color = (127, 127, 127)
        else:
            background_color = tuple(int(x * 255) for x in image_mean)

        high_res_image_mean = kwargs.get("high_res_image_mean", None)
        if high_res_image_mean is None:
            high_res_background_color = (127, 127, 127)
        else:
            high_res_background_color = tuple(int(x * 255) for x in high_res_image_mean)

        # Run the parent initializer on *this* instance. `DeepseekVLImageProcessorFast().__init__(**kwargs)`
        # would build a separate object and initialize that one instead, leaving `self` untouched.
        DeepseekVLImageProcessorFast.__init__(self, **kwargs)
        self.background_color = background_color
        self.high_res_background_color = high_res_background_color

    def _further_process_kwargs(
        self,
        size: Optional[SizeDict] = None,
        high_res_size: Optional[SizeDict] = None,
        default_to_square: Optional[bool] = None,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        high_res_image_mean: Optional[Union[float, list[float]]] = None,
        high_res_image_std: Optional[Union[float, list[float]]] = None,
        data_format: Optional[ChannelDimension] = None,
        **kwargs,
    ) -> dict:
        """
        Update kwargs that need further processing before being validated.
        Can be overridden by subclasses to customize the processing of kwargs.

        Sizes are converted to `SizeDict`, list-valued means/stds to tuples, `data_format` defaults to
        `ChannelDimension.FIRST`, and `high_res_resample` is replaced by `high_res_interpolation`.

        Returns:
            `dict`: `kwargs` updated with the processed values.
        """
        if size is not None:
            size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square))
        if high_res_size is not None:
            high_res_size = SizeDict(**get_size_dict(size=high_res_size, default_to_square=default_to_square))

        if isinstance(image_mean, list):
            image_mean = tuple(image_mean)
        if isinstance(image_std, list):
            image_std = tuple(image_std)
        if isinstance(high_res_image_mean, list):
            high_res_image_mean = tuple(high_res_image_mean)
        if isinstance(high_res_image_std, list):
            high_res_image_std = tuple(high_res_image_std)

        if data_format is None:
            data_format = ChannelDimension.FIRST

        # Only fall back to the class/instance default when the key is absent; an explicitly
        # passed value (including None) is used as given.
        high_res_resample = kwargs.pop("high_res_resample", self.high_res_resample)
        kwargs["high_res_interpolation"] = (
            pil_torch_interpolation_mapping[high_res_resample]
            if isinstance(high_res_resample, (int, PILImageResampling))
            else high_res_resample
        )

        kwargs["size"] = size
        kwargs["high_res_size"] = high_res_size
        kwargs["default_to_square"] = default_to_square
        kwargs["image_mean"] = image_mean
        kwargs["image_std"] = image_std
        kwargs["high_res_image_mean"] = high_res_image_mean
        kwargs["high_res_image_std"] = high_res_image_std
        kwargs["data_format"] = data_format

        return kwargs

    def _preprocess(
        self,
        images: list["torch.Tensor"],
        do_resize: bool,
        size: SizeDict,
        high_res_size: SizeDict,
        min_size: int,
        interpolation: Optional["F.InterpolationMode"],
        high_res_interpolation: Optional["F.InterpolationMode"],
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Optional[Union[float, list[float]]],
        image_std: Optional[Union[float, list[float]]],
        high_res_image_mean: Optional[Union[float, list[float]]],
        high_res_image_std: Optional[Union[float, list[float]]],
        disable_grouping: Optional[bool],
        return_tensors: Optional[Union[str, TensorType]],
        do_pad: bool = True,
        **kwargs,
    ) -> BatchFeature:
        """
        Produce both the high resolution and the low resolution inputs.

        Pipeline:
            high res: resize (high) -> pad -> rescale/normalize (high statistics)
            low res:  resize (high) -> pad -> resize (low) -> pad -> rescale/normalize (low statistics)

        Returns:
            `BatchFeature` with keys `pixel_values` (low resolution) and `high_res_pixel_values`.
        """
        # Group images by size for batched resizing
        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
        high_res_resized_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            # When `do_resize` is False the group is passed through unchanged (previously this
            # path referenced a variable that was never assigned).
            if do_resize:
                stacked_images = self.resize(
                    image=stacked_images, size=high_res_size, min_size=min_size, interpolation=high_res_interpolation
                )
            high_res_resized_images_grouped[shape] = stacked_images
        high_res_resized_images = reorder_images(high_res_resized_images_grouped, grouped_images_index)

        # Group images by size for further processing
        # Needed in case do_resize is False, or resize returns images with different sizes
        grouped_high_res_images, grouped_high_res_images_index = group_images_by_shape(
            high_res_resized_images, disable_grouping=disable_grouping
        )
        # Padded-but-not-normalized high resolution groups; they are the input of the low resolution branch.
        high_res_padded_images = {}
        high_res_processed_images_grouped = {}
        for shape, stacked_high_res_images in grouped_high_res_images.items():
            if do_pad:
                stacked_high_res_images = self.pad_to_square(
                    stacked_high_res_images, background_color=self.high_res_background_color
                )
            high_res_padded_images[shape] = stacked_high_res_images
            # Fused rescale and normalize
            high_res_processed_images_grouped[shape] = self.rescale_and_normalize(
                stacked_high_res_images,
                do_rescale,
                rescale_factor,
                do_normalize,
                high_res_image_mean,
                high_res_image_std,
            )
        high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index)
        high_res_processed_images = (
            torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images
        )

        resized_images_grouped = {}
        for shape, stacked_high_res_padded_images in high_res_padded_images.items():
            # Start from the padded high resolution group so that `do_resize=False` does not
            # pick up a stale tensor left over from an earlier loop.
            stacked_images = stacked_high_res_padded_images
            if do_resize:
                stacked_images = self.resize(
                    image=stacked_high_res_padded_images, size=size, min_size=min_size, interpolation=interpolation
                )
            resized_images_grouped[shape] = stacked_images
        resized_images = reorder_images(resized_images_grouped, grouped_high_res_images_index)

        grouped_resized_images, grouped_resized_images_index = group_images_by_shape(
            resized_images, disable_grouping=disable_grouping
        )
        processed_images_grouped = {}
        for shape, stacked_images in grouped_resized_images.items():
            if do_pad:
                stacked_images = self.pad_to_square(stacked_images, background_color=self.background_color)
            # Fused rescale and normalize
            processed_images_grouped[shape] = self.rescale_and_normalize(
                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
            )
        processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index)
        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

        return BatchFeature(
            data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images},
            tensor_type=return_tensors,
        )
class DeepseekVLHybridProcessorKwargs(DeepseekVLProcessorKwargs): class DeepseekVLHybridProcessorKwargs(DeepseekVLProcessorKwargs):
pass pass
@@ -773,5 +980,6 @@ __all__ = [
"DeepseekVLHybridModel", "DeepseekVLHybridModel",
"DeepseekVLHybridForConditionalGeneration", "DeepseekVLHybridForConditionalGeneration",
"DeepseekVLHybridImageProcessor", "DeepseekVLHybridImageProcessor",
"DeepseekVLHybridImageProcessorFast",
"DeepseekVLHybridProcessor", "DeepseekVLHybridProcessor",
] ]

View File

@@ -20,6 +20,7 @@ from ...utils.import_utils import define_import_structure
if TYPE_CHECKING: if TYPE_CHECKING:
from .configuration_janus import * from .configuration_janus import *
from .image_processing_janus import * from .image_processing_janus import *
from .image_processing_janus_fast import *
from .modeling_janus import * from .modeling_janus import *
from .processing_janus import * from .processing_janus import *
else: else:

View File

@@ -134,6 +134,7 @@ class JanusImageProcessor(BaseImageProcessor):
self, self,
image: np.ndarray, image: np.ndarray,
size: Union[dict[str, int], int], size: Union[dict[str, int], int],
background_color: Optional[tuple[int, int, int]] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC, resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None, data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -145,6 +146,10 @@ class JanusImageProcessor(BaseImageProcessor):
Args: Args:
image (`np.ndarray`): image (`np.ndarray`):
Image to resize. Image to resize.
size (`dict[str, int]` or `int`):
The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
background_color (`tuple[int, int, int]`):
The background color to use for the padding.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
data_format (`ChannelDimension` or `str`, *optional*): data_format (`ChannelDimension` or `str`, *optional*):
@@ -163,6 +168,7 @@ class JanusImageProcessor(BaseImageProcessor):
Returns: Returns:
`np.ndarray`: The resized image. `np.ndarray`: The resized image.
""" """
background_color = background_color if background_color is not None else self.background_color
if input_data_format is None: if input_data_format is None:
input_data_format = infer_channel_dimension_format(image) input_data_format = infer_channel_dimension_format(image)
@@ -194,7 +200,7 @@ class JanusImageProcessor(BaseImageProcessor):
# Expand and pad the images to obtain a square image of dimensions `size x size` # Expand and pad the images to obtain a square image of dimensions `size x size`
image = self.pad_to_square( image = self.pad_to_square(
image=image, image=image,
background_color=self.background_color, background_color=background_color,
input_data_format=input_data_format, input_data_format=input_data_format,
) )
return image return image

View File

@@ -0,0 +1,245 @@
# coding=utf-8
# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Union
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
from ...image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ImageInput,
PILImageResampling,
SizeDict,
)
from ...processing_utils import Unpack
from ...utils import (
TensorType,
auto_docstring,
is_torch_available,
is_torchvision_available,
is_torchvision_v2_available,
)
if is_torch_available():
import torch
if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.transforms import functional as F
class JanusFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    r"""
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width
        falls below this value after resizing.
    """

    # Lower bound applied to each side of the resized (not yet padded) image;
    # see `JanusImageProcessorFast.resize`.
    min_size: int
@auto_docstring
class JanusImageProcessorFast(BaseImageProcessorFast):
    resample = PILImageResampling.BICUBIC
    image_mean = OPENAI_CLIP_MEAN
    image_std = OPENAI_CLIP_STD
    size = {"height": 384, "width": 384}
    min_size = 14
    do_resize = True
    do_rescale = True
    do_normalize = True
    valid_kwargs = JanusFastImageProcessorKwargs

    def __init__(self, **kwargs: Unpack[JanusFastImageProcessorKwargs]):
        # The padding color is the normalization mean scaled to 0-255, or mid-grey when
        # no mean is passed explicitly.
        image_mean = kwargs.get("image_mean", None)
        if image_mean is None:
            background_color = (127, 127, 127)
        else:
            background_color = tuple(int(x * 255) for x in image_mean)
        super().__init__(**kwargs)
        self.background_color = background_color

    def resize(
        self,
        image: "torch.Tensor",
        size: SizeDict,
        min_size: int,
        interpolation: Optional["F.InterpolationMode"] = None,
        antialias: bool = True,
        **kwargs,
    ) -> "torch.Tensor":
        """
        Resize so that the longest side equals the (square) target size, keeping the aspect ratio.

        Args:
            image (`torch.Tensor`):
                Image(s) to resize; height and width are read from the last two dimensions.
            size (`SizeDict`):
                Target size. Height and width must both be set and equal.
            min_size (`int`):
                Lower bound applied to each output side.
            interpolation (`F.InterpolationMode`, *optional*):
                Forwarded to the parent `resize`.
            antialias (`bool`, *optional*, defaults to `True`):
                Forwarded to the parent `resize`.

        Returns:
            `torch.Tensor`: The resized, not yet padded, image(s).

        Raises:
            ValueError: If the requested height and width are missing or differ.
        """
        if size.height is None or size.width is None or size.height != size.width:
            raise ValueError(
                f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
            )
        target = size.height

        height, width = image.shape[-2:]
        delta = target / max(height, width)
        # Largest side becomes `size` and the other side is scaled according to the aspect ratio.
        output_size_nonpadded = SizeDict(
            height=max(int(height * delta), min_size),
            width=max(int(width * delta), min_size),
        )

        return super().resize(image, size=output_size_nonpadded, interpolation=interpolation, antialias=antialias)

    def pad_to_square(
        self,
        images: "torch.Tensor",
        background_color: Union[int, tuple[int, int, int]] = 0,
    ) -> "torch.Tensor":
        """
        Pads an image to a square based on the longest edge.

        Args:
            images (`torch.Tensor`):
                The images to pad. Indexed as (batch, channels, height, width).
            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
                The color to use for the padding. Can be an integer for single channel or a
                tuple of integers for multi-channel images. If passed as an integer
                in multi-channel mode, it will default to `0` in subsequent channels.

        Returns:
            `torch.Tensor`: The padded images.

        Raises:
            ValueError: If `background_color` is a sequence whose length differs from the number of channels.
        """
        height, width = images.shape[-2:]
        num_channels = images.shape[1]
        batch_size = images.shape[0]

        if height == width:
            return images

        max_dim = max(height, width)

        # Ensure background_color is the correct shape
        if isinstance(background_color, int):
            # Only the first channel gets the color; the others keep the zero fill below.
            background_color = [background_color]
        elif len(background_color) != num_channels:
            raise ValueError(
                f"background_color must have exactly {num_channels} elements to match the number of channels"
            )

        padded_images = torch.zeros(
            (batch_size, num_channels, max_dim, max_dim), dtype=images.dtype, device=images.device
        )
        for i, color in enumerate(background_color):
            padded_images[:, i, :, :] = color

        # Center the original content along the shorter axis.
        if width > height:
            start = (max_dim - height) // 2
            padded_images[:, :, start : start + height, :] = images
        else:
            start = (max_dim - width) // 2
            padded_images[:, :, :, start : start + width] = images

        return padded_images

    def _preprocess(
        self,
        images: list["torch.Tensor"],
        do_resize: bool,
        size: SizeDict,
        min_size: int,
        interpolation: Optional["F.InterpolationMode"],
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Optional[Union[float, list[float]]],
        image_std: Optional[Union[float, list[float]]],
        disable_grouping: Optional[bool],
        return_tensors: Optional[Union[str, TensorType]],
        do_pad: bool = True,
        **kwargs,
    ) -> BatchFeature:
        """
        Resize, pad to square, then rescale/normalize the images, batching same-shaped images together.

        Returns:
            `BatchFeature` with a single `pixel_values` entry (stacked when `return_tensors` is set).
        """
        # Group images by size for batched resizing
        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
        resized_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            if do_resize:
                stacked_images = self.resize(
                    image=stacked_images, size=size, min_size=min_size, interpolation=interpolation
                )
            resized_images_grouped[shape] = stacked_images
        resized_images = reorder_images(resized_images_grouped, grouped_images_index)

        # Group images by size for further processing
        # Needed in case do_resize is False, or resize returns images with different sizes
        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
        processed_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            if do_pad:
                stacked_images = self.pad_to_square(stacked_images, background_color=self.background_color)
            # Fused rescale and normalize
            stacked_images = self.rescale_and_normalize(
                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
            )
            processed_images_grouped[shape] = stacked_images

        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

    def postprocess(
        self,
        images: ImageInput,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[list[float]] = None,
        image_std: Optional[list[float]] = None,
        return_tensors: Optional[str] = None,
    ) -> "torch.Tensor":
        """
        Map model-space images back towards displayable images by re-running `preprocess`
        with rewritten statistics and with resizing and padding disabled.

        NOTE(review): the rewritten mean/std are meant to invert the normalization; this relies on
        how `rescale_and_normalize` combines the two steps — confirm against the base class.

        Args:
            images (`ImageInput`):
                Images to post-process.
            do_rescale, rescale_factor, do_normalize, image_mean, image_std:
                Overrides for the processor attributes. `rescale_factor` defaults to
                `1.0 / self.rescale_factor`.
            return_tensors (`str`, *optional*):
                Tensor type for the result, or `"PIL.Image.Image"` to get PIL images
                (only when both `do_rescale` and `do_normalize` are enabled).

        Returns:
            `BatchFeature` holding the result under `pixel_values`.
        """
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = 1.0 / self.rescale_factor if rescale_factor is None else rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        image_mean = tuple(-rescale_factor * mean / std for mean, std in zip(image_mean, image_std))
        image_std = tuple(1 / std for std in image_std)

        # "PIL.Image.Image" is a pseudo value handled only in this method. It must not be forwarded
        # to `preprocess`, which passes `return_tensors` on to `BatchFeature(tensor_type=...)`.
        wants_pil = return_tensors == "PIL.Image.Image"
        tensor_type = None if wants_pil else return_tensors

        images = self.preprocess(
            images,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=False,
            do_pad=False,
            return_tensors=tensor_type,
        ).pixel_values
        if do_rescale:
            images = [image.clip(0, 255).to(torch.uint8) for image in images]

        if do_normalize and do_rescale and wants_pil:
            images = [F.to_pil_image(image) for image in images]

        data = {"pixel_values": images}

        return BatchFeature(data=data, tensor_type=tensor_type)
__all__ = ["JanusImageProcessorFast"]

View File

@@ -1437,6 +1437,7 @@ class JanusImageProcessor(BlipImageProcessor):
self, self,
image: np.ndarray, image: np.ndarray,
size: Union[dict[str, int], int], size: Union[dict[str, int], int],
background_color: Optional[tuple[int, int, int]] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC, resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None, data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -1448,6 +1449,10 @@ class JanusImageProcessor(BlipImageProcessor):
Args: Args:
image (`np.ndarray`): image (`np.ndarray`):
Image to resize. Image to resize.
size (`dict[str, int]` or `int`):
The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
background_color (`tuple[int, int, int]`):
The background color to use for the padding.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
data_format (`ChannelDimension` or `str`, *optional*): data_format (`ChannelDimension` or `str`, *optional*):
@@ -1466,6 +1471,7 @@ class JanusImageProcessor(BlipImageProcessor):
Returns: Returns:
`np.ndarray`: The resized image. `np.ndarray`: The resized image.
""" """
background_color = background_color if background_color is not None else self.background_color
if input_data_format is None: if input_data_format is None:
input_data_format = infer_channel_dimension_format(image) input_data_format = infer_channel_dimension_format(image)
@@ -1497,7 +1503,7 @@ class JanusImageProcessor(BlipImageProcessor):
# Expand and pad the images to obtain a square image of dimensions `size x size` # Expand and pad the images to obtain a square image of dimensions `size x size`
image = self.pad_to_square( image = self.pad_to_square(
image=image, image=image,
background_color=self.background_color, background_color=background_color,
input_data_format=input_data_format, input_data_format=input_data_format,
) )
return image return image

View File

@@ -17,14 +17,21 @@
import unittest import unittest
from transformers.testing_utils import require_torch, require_vision from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_vision_available from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available(): if is_vision_available():
from transformers import DeepseekVLImageProcessor from transformers import DeepseekVLImageProcessor
if is_torchvision_available():
from transformers import DeepseekVLImageProcessorFast
# Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester with ViT->DeepseekVL # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester with ViT->DeepseekVL
class DeepseekVLImageProcessingTester: class DeepseekVLImageProcessingTester:
@@ -83,10 +90,9 @@ class DeepseekVLImageProcessingTester:
@require_torch @require_torch
@require_vision @require_vision
# Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTest with ViT->DeepseekVL
class DeepseekVLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): class DeepseekVLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
# Ignore copy
image_processing_class = DeepseekVLImageProcessor if is_vision_available() else None image_processing_class = DeepseekVLImageProcessor if is_vision_available() else None
fast_image_processing_class = DeepseekVLImageProcessorFast if is_torchvision_available() else None
def setUp(self): def setUp(self):
super().setUp() super().setUp()
@@ -113,6 +119,33 @@ class DeepseekVLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42) image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
self.assertEqual(image_processor.size, {"height": 42, "width": 42}) self.assertEqual(image_processor.size, {"height": 42, "width": 42})
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
    """Check that slow and fast processors agree image by image on inputs of unequal resolution."""
    if not self.test_slow_image_processor or not self.test_fast_image_processor:
        self.skipTest(reason="Skipping slow/fast equivalence test")

    if self.image_processing_class is None or self.fast_image_processing_class is None:
        self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

    if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
        self.skipTest(
            reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
        )

    dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
    image_processor_slow = self.image_processing_class(**self.image_processor_dict)
    image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)

    encoding_slow = image_processor_slow(dummy_images, return_tensors=None)
    encoding_fast = image_processor_fast(dummy_images, return_tensors=None)

    # Overwrite as the outputs are not always all of the same shape (kept for BC)
    # Compare the counts first: iterating over the slow outputs alone would let extra
    # fast outputs go unnoticed.
    self.assertEqual(len(encoding_slow.pixel_values), len(encoding_fast.pixel_values))
    for slow_image, fast_image in zip(encoding_slow.pixel_values, encoding_fast.pixel_values):
        self._assert_slow_fast_tensors_equivalence(torch.from_numpy(slow_image), fast_image)
# Ignore copy # Ignore copy
@unittest.skip(reason="Not supported") @unittest.skip(reason="Not supported")
def test_call_numpy_4_channels(self): def test_call_numpy_4_channels(self):

View File

@@ -13,13 +13,13 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import unittest import unittest
import numpy as np import numpy as np
import requests
from transformers.testing_utils import require_torch, require_vision from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
@@ -32,6 +32,9 @@ if is_vision_available():
from transformers import DeepseekVLHybridImageProcessor from transformers import DeepseekVLHybridImageProcessor
if is_torchvision_available():
from transformers import DeepseekVLHybridImageProcessorFast
class DeepseekVLHybridImageProcessingTester: class DeepseekVLHybridImageProcessingTester:
def __init__( def __init__(
@@ -104,6 +107,7 @@ class DeepseekVLHybridImageProcessingTester:
@require_vision @require_vision
class DeepseekVLHybridImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): class DeepseekVLHybridImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = DeepseekVLHybridImageProcessor if is_vision_available() else None image_processing_class = DeepseekVLHybridImageProcessor if is_vision_available() else None
fast_image_processing_class = DeepseekVLHybridImageProcessorFast if is_torchvision_available() else None
# Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester.setUp with ViT->DeepseekVLHybrid # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester.setUp with ViT->DeepseekVLHybrid
def setUp(self): def setUp(self):
@@ -213,6 +217,59 @@ class DeepseekVLHybridImageProcessingTest(ImageProcessingTestMixin, unittest.Tes
(self.image_processor_tester.batch_size, *expected_output_image_shape), (self.image_processor_tester.batch_size, *expected_output_image_shape),
) )
@require_vision
@require_torch
def test_slow_fast_equivalence(self):
    """Check that slow and fast processors agree on both outputs for a single real image."""
    if not self.test_slow_image_processor or not self.test_fast_image_processor:
        self.skipTest(reason="Skipping slow/fast equivalence test")

    if self.image_processing_class is None or self.fast_image_processing_class is None:
        self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

    # A timeout keeps a stalled download from hanging the whole test run.
    dummy_image = Image.open(
        requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True, timeout=30).raw
    )
    image_processor_slow = self.image_processing_class(**self.image_processor_dict)
    image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)

    encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
    encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
    self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
    self._assert_slow_fast_tensors_equivalence(
        encoding_slow.high_res_pixel_values, encoding_fast.high_res_pixel_values
    )
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
    """Compare slow and fast processors on a batch of differently sized images.

    Skips when slow/fast testing is disabled, when either processor class is
    missing, or when the tester enables ``do_center_crop``. Outputs are
    requested with ``return_tensors=None`` and compared entry by entry for
    both ``pixel_values`` and ``high_res_pixel_values``.
    """
    if not (self.test_slow_image_processor and self.test_fast_image_processor):
        self.skipTest(reason="Skipping slow/fast equivalence test")

    slow_cls = self.image_processing_class
    fast_cls = self.fast_image_processing_class
    if slow_cls is None or fast_cls is None:
        self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

    if getattr(self.image_processor_tester, "do_center_crop", False):
        self.skipTest(
            reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
        )

    batch = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
    slow_processor = slow_cls(**self.image_processor_dict)
    fast_processor = fast_cls(**self.image_processor_dict)

    slow_output = slow_processor(batch, return_tensors=None)
    fast_output = fast_processor(batch, return_tensors=None)

    # Overwrite as the outputs are not always all of the same shape (kept for BC)
    for index, slow_values in enumerate(slow_output.pixel_values):
        self._assert_slow_fast_tensors_equivalence(
            torch.from_numpy(slow_values), fast_output.pixel_values[index]
        )

    for index, slow_values in enumerate(slow_output.high_res_pixel_values):
        self._assert_slow_fast_tensors_equivalence(
            torch.from_numpy(slow_values), fast_output.high_res_pixel_values[index]
        )
@unittest.skip(reason="Not supported")
def test_call_numpy_4_channels(self):
    # Unconditionally skipped with reason "Not supported"; the body is intentionally empty.
    # NOTE(review): presumably overrides the shared mixin test of the same name — confirm.
    pass

View File

@@ -18,7 +18,7 @@ import unittest
import numpy as np import numpy as np
from transformers.testing_utils import require_torch, require_vision from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
@@ -31,6 +31,9 @@ if is_vision_available():
from transformers import JanusImageProcessor from transformers import JanusImageProcessor
if is_torchvision_available():
from transformers import JanusImageProcessorFast
class JanusImageProcessingTester: class JanusImageProcessingTester:
def __init__( def __init__(
@@ -44,8 +47,8 @@ class JanusImageProcessingTester:
do_resize=True, do_resize=True,
size=None, size=None,
do_normalize=True, do_normalize=True,
image_mean=[1.0, 1.0, 1.0], image_mean=[0.48145466, 0.4578275, 0.40821073],
image_std=[1.0, 1.0, 1.0], image_std=[0.26862954, 0.26130258, 0.27577711],
do_convert_rgb=True, do_convert_rgb=True,
): ):
size = size if size is not None else {"height": 384, "width": 384} size = size if size is not None else {"height": 384, "width": 384}
@@ -89,6 +92,7 @@ class JanusImageProcessingTester:
@require_vision @require_vision
class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = JanusImageProcessor if is_vision_available() else None image_processing_class = JanusImageProcessor if is_vision_available() else None
fast_image_processing_class = JanusImageProcessorFast if is_torchvision_available() else None
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->Janus # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->Janus
def setUp(self): def setUp(self):
@@ -101,7 +105,8 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
return self.image_processor_tester.prepare_image_processor_dict() return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self): def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict) for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_resize")) self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size")) self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_normalize")) self.assertTrue(hasattr(image_processing, "do_normalize"))
@@ -110,18 +115,20 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertTrue(hasattr(image_processing, "do_convert_rgb")) self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
def test_image_processor_from_dict_with_kwargs(self):
    """Check ``from_dict`` defaults and keyword overrides for each processor class.

    For every class in ``self.image_processor_list``, a processor built from
    the tester's dict must report the default ``size`` and ``image_mean``,
    and one built with explicit ``size`` / ``image_mean`` keyword arguments
    must report the overridden values.
    """
    for processor_cls in self.image_processor_list:
        default_processor = processor_cls.from_dict(self.image_processor_dict)
        self.assertEqual(default_processor.size, {"height": 384, "width": 384})
        self.assertEqual(default_processor.image_mean, [0.48145466, 0.4578275, 0.40821073])

        # Keyword arguments passed to from_dict take precedence over the dict values.
        overrides = {"size": 42, "image_mean": [1.0, 2.0, 1.0]}
        overridden_processor = processor_cls.from_dict(self.image_processor_dict, **overrides)
        self.assertEqual(overridden_processor.size, {"height": 42, "width": 42})
        self.assertEqual(overridden_processor.image_mean, [1.0, 2.0, 1.0])
def test_call_pil(self): def test_call_pil(self):
image_processing = self.image_processing_class(**self.image_processor_dict) for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
for image in image_inputs: for image in image_inputs:
self.assertIsInstance(image, Image.Image) self.assertIsInstance(image, Image.Image)
@@ -137,7 +144,8 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
def test_call_numpy(self): def test_call_numpy(self):
image_processing = self.image_processing_class(**self.image_processor_dict) for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True) image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
for image in image_inputs: for image in image_inputs:
self.assertIsInstance(image, np.ndarray) self.assertIsInstance(image, np.ndarray)
@@ -151,7 +159,8 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
def test_call_pytorch(self): def test_call_pytorch(self):
image_processing = self.image_processing_class(**self.image_processor_dict) for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True) image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
for image in image_inputs: for image in image_inputs:
@@ -166,7 +175,8 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
def test_nested_input(self): def test_nested_input(self):
image_processing = self.image_processing_class(**self.image_processor_dict) for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
# Test batched as a list of images. # Test batched as a list of images.
@@ -183,6 +193,50 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
# Image processor should return same pixel values, independently of input format. # Image processor should return same pixel values, independently of input format.
self.assertTrue((encoded_images_nested == encoded_images).all()) self.assertTrue((encoded_images_nested == encoded_images).all())
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
    """Compare slow and fast processors on a batch of differently sized images.

    Skips when slow/fast testing is disabled, when either processor class is
    missing, or when the tester enables ``do_center_crop``. Outputs are
    requested with ``return_tensors=None`` and ``pixel_values`` is compared
    entry by entry.
    """
    if not (self.test_slow_image_processor and self.test_fast_image_processor):
        self.skipTest(reason="Skipping slow/fast equivalence test")

    slow_cls = self.image_processing_class
    fast_cls = self.fast_image_processing_class
    if slow_cls is None or fast_cls is None:
        self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

    if getattr(self.image_processor_tester, "do_center_crop", False):
        self.skipTest(
            reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
        )

    batch = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
    slow_processor = slow_cls(**self.image_processor_dict)
    fast_processor = fast_cls(**self.image_processor_dict)

    slow_output = slow_processor(batch, return_tensors=None)
    fast_output = fast_processor(batch, return_tensors=None)

    # Overwrite as the outputs are not always all of the same shape (kept for BC)
    for index, slow_values in enumerate(slow_output.pixel_values):
        self._assert_slow_fast_tensors_equivalence(
            torch.from_numpy(slow_values), fast_output.pixel_values[index]
        )
@require_vision
@require_torch
def test_slow_fast_equivalence_postprocess(self):
    """Compare the ``postprocess`` output of the slow and fast processors.

    Torch image inputs of differing resolution are divided by 255.0 and
    passed to ``postprocess`` on both processors with ``return_tensors=None``.
    The resulting ``pixel_values`` entries are cast to float and compared one
    by one.

    The test is skipped, rather than failing with a ``TypeError``, when
    slow/fast testing is disabled or when either processor class is not
    defined (for example when the fast class is ``None``).
    """
    if not self.test_slow_image_processor or not self.test_fast_image_processor:
        self.skipTest(reason="Skipping slow/fast equivalence test")

    if self.image_processing_class is None or self.fast_image_processing_class is None:
        self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

    dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
    # Rescale the inputs by 1/255 before handing them to postprocess.
    dummy_images = [image / 255.0 for image in dummy_images]
    image_processor_slow = self.image_processing_class(**self.image_processor_dict)
    image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)

    encoding_slow = image_processor_slow.postprocess(dummy_images, return_tensors=None)
    encoding_fast = image_processor_fast.postprocess(dummy_images, return_tensors=None)

    # Overwrite as the outputs are not always all of the same shape (kept for BC)
    for i, slow_values in enumerate(encoding_slow.pixel_values):
        # Cast both sides to float so the comparison runs on a common dtype.
        self._assert_slow_fast_tensors_equivalence(
            torch.from_numpy(slow_values).float(), encoding_fast.pixel_values[i].float()
        )
@unittest.skip(reason="Not supported")
def test_call_numpy_4_channels(self):
    # Unconditionally skipped with reason "Not supported"; the body is intentionally empty.
    # NOTE(review): presumably overrides the shared mixin test of the same name — confirm.
    pass