🚨[Fast Image Processor] Force Fast Image Processor for Qwen2_VL/2_5_VL + Refactor (#39591)
* init
* Force qwen2VL image proc to fast
* refactor qwen2 vl fast
* fix copies
* Update after PR review and update tests to use return_tensors="pt"
* fix processor tests
* add BC for min pixels/max pixels
This commit is contained in:
@@ -49,6 +49,9 @@ from .configuration_auto import (
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
FORCE_FAST_IMAGE_PROCESSOR = ["Qwen2VLImageProcessor"]
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# This significantly improves completion suggestion performance when
|
||||
# the transformers package is used with Microsoft's Pylance language server.
|
||||
@@ -514,6 +517,13 @@ class AutoImageProcessor:
|
||||
# if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor.
|
||||
if use_fast is None:
|
||||
use_fast = image_processor_type.endswith("Fast")
|
||||
if not use_fast and image_processor_type in FORCE_FAST_IMAGE_PROCESSOR and is_torchvision_available():
|
||||
use_fast = True
|
||||
logger.warning_once(
|
||||
f"The image processor of type `{image_processor_type}` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. "
|
||||
"This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. "
|
||||
"Note that this behavior will be extended to all models in a future release."
|
||||
)
|
||||
if not use_fast:
|
||||
logger.warning_once(
|
||||
"Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
|
||||
|
||||
@@ -67,7 +67,7 @@ class ColQwen2Processor(ColPaliProcessor):
|
||||
query_prefix (`str`, *optional*): A prefix to be used for the query.
|
||||
"""
|
||||
|
||||
image_processor_class = "Qwen2VLImageProcessor"
|
||||
image_processor_class = "AutoImageProcessor"
|
||||
tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -66,7 +66,7 @@ class ColQwen2Processor(ProcessorMixin):
|
||||
|
||||
attributes = ["image_processor", "tokenizer"]
|
||||
|
||||
image_processor_class = "Qwen2VLImageProcessor"
|
||||
image_processor_class = "AutoImageProcessor"
|
||||
tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -138,6 +138,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
processed_images_grouped = {}
|
||||
processed_grids = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
resized_height, resized_width = stacked_images.shape[-2:]
|
||||
# Fused rescale and normalize
|
||||
stacked_images = self.rescale_and_normalize(
|
||||
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
||||
@@ -188,9 +189,6 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
||||
images: ImageInput,
|
||||
**kwargs: Unpack[Glm4vFastImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Preprocess an image or batch of images.
|
||||
"""
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
|
||||
|
||||
@@ -35,9 +35,6 @@ from ...image_utils import (
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
SizeDict,
|
||||
get_image_size,
|
||||
make_flat_list_of_images,
|
||||
valid_images,
|
||||
)
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import (
|
||||
@@ -57,8 +54,6 @@ if is_torch_available():
|
||||
|
||||
|
||||
if is_torchvision_available():
|
||||
from ...image_utils import pil_torch_interpolation_mapping
|
||||
|
||||
if is_torchvision_v2_available():
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
else:
|
||||
@@ -110,18 +105,90 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
|
||||
size = kwargs.pop("size", None)
|
||||
min_pixels = kwargs.pop("min_pixels", None)
|
||||
max_pixels = kwargs.pop("max_pixels", None)
|
||||
if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
|
||||
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
||||
else:
|
||||
size = self.size
|
||||
# backward compatibility: override size with min_pixels and max_pixels if they are provided
|
||||
size = self.size if size is None else size
|
||||
if min_pixels is not None:
|
||||
size["shortest_edge"] = min_pixels
|
||||
size.pop("min_pixels", None)
|
||||
if max_pixels is not None:
|
||||
size["longest_edge"] = max_pixels
|
||||
size.pop("max_pixels", None)
|
||||
if "shortest_edge" not in size or "longest_edge" not in size:
|
||||
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
||||
|
||||
super().__init__(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)
|
||||
|
||||
def _further_process_kwargs(
    self,
    size: Optional[SizeDict] = None,
    min_pixels: Optional[int] = None,
    max_pixels: Optional[int] = None,
    **kwargs,
) -> dict:
    """
    Reconcile `size` with the legacy `min_pixels` / `max_pixels` arguments, then defer to the parent class.

    Resolution order:
      1. If both `min_pixels` and `max_pixels` are given, they define `size`
         (`shortest_edge` = `min_pixels`, `longest_edge` = `max_pixels`) and any passed `size` is replaced.
      2. Otherwise, if `size` is given, it must expose `shortest_edge` and `longest_edge`;
         `min_pixels` / `max_pixels` are then read back from it.
      3. Otherwise a shallow copy of `self.size` is used and `min_pixels` / `max_pixels` are forwarded unchanged.

    Raises:
        ValueError: if `size` is used (case 2) and lacks `shortest_edge` or `longest_edge`.

    Returns:
        Whatever the parent `_further_process_kwargs` returns for the reconciled arguments.
    """
    both_pixel_bounds_given = not (min_pixels is None or max_pixels is None)

    if both_pixel_bounds_given:
        # Backward compatibility: the pixel bounds take precedence over `size`.
        size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
    elif size is None:
        # Nothing usable was passed: fall back to a copy of the instance default.
        size = {**self.size}
    else:
        if not ("shortest_edge" in size and "longest_edge" in size):
            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
        min_pixels = size["shortest_edge"]
        max_pixels = size["longest_edge"]

    return super()._further_process_kwargs(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)
|
||||
|
||||
@auto_docstring
def preprocess(
    self,
    images: ImageInput,
    videos: Optional[VideoInput] = None,
    **kwargs: Unpack[Qwen2VLFastImageProcessorKwargs],
) -> BatchFeature:
    # Pure pass-through to the parent implementation. The override exists so the
    # signature exposes `videos` and the Qwen2-VL specific kwargs type.
    # Documented with comments only, so the text `@auto_docstring` sees is unchanged.
    parent_preprocess = super().preprocess
    return parent_preprocess(images, videos, **kwargs)
|
||||
|
||||
def _preprocess_image_like_inputs(
    self,
    images: ImageInput,
    videos: VideoInput,
    do_convert_rgb: bool,
    input_data_format: ChannelDimension,
    device: Optional[Union[str, "torch.device"]] = None,
    **kwargs: Unpack[DefaultFastImageProcessorKwargs],
) -> BatchFeature:
    """
    Prepare and preprocess images and, for backward compatibility only, videos.

    Images (when not `None`) are prepared with `_prepare_image_like_inputs` and passed to `_preprocess`;
    the result becomes the returned `BatchFeature`.

    Videos (when not `None`) follow a deprecated path: a warning is logged, the input is batched with
    `make_batched_videos`, the prepared frames of each video are stacked into a single tensor, and the
    stacked videos are passed to `_preprocess`. The video results are stored in the returned feature
    under `pixel_values_videos` and `video_grid_thw`.

    Args:
        images: image inputs, or `None` to skip image processing.
        videos: video inputs, or `None` to skip the deprecated video path.
        do_convert_rgb: forwarded to `_prepare_image_like_inputs`.
        input_data_format: forwarded to `_prepare_image_like_inputs`.
        device: forwarded to `_prepare_image_like_inputs`.
        **kwargs: forwarded unchanged to `_preprocess`.

    Returns:
        The `BatchFeature` holding the image outputs and/or the video outputs (empty if both inputs are `None`).
    """
    outputs = BatchFeature()

    if images is not None:
        prepared_images = self._prepare_image_like_inputs(
            images=images,
            do_convert_rgb=do_convert_rgb,
            input_data_format=input_data_format,
            device=device,
        )
        outputs = self._preprocess(prepared_images, **kwargs)

    if videos is None:
        return outputs

    logger.warning(
        "`Qwen2VLImageProcessorFast` works only with image inputs and doesn't process videos anymore. "
        "This is a deprecated behavior and will be removed in v5.0. "
        "Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
    )

    # Can't change _prepare_images_structure to work with videos because it also needs to work with images,
    # so each video is prepared frame-wise and stacked here.
    stacked_videos = []
    for video in make_batched_videos(videos):
        frames = self._prepare_image_like_inputs(video, do_convert_rgb, input_data_format, device)
        stacked_videos.append(torch.stack(frames))

    video_features = self._preprocess(stacked_videos, **kwargs)
    video_entries = {
        "pixel_values_videos": video_features.pixel_values,
        "video_grid_thw": video_features.image_grid_thw,
    }
    outputs.update(video_entries)
    return outputs
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
images: list["torch.Tensor"],
|
||||
@@ -136,65 +203,15 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
|
||||
patch_size: int,
|
||||
temporal_patch_size: int,
|
||||
merge_size: int,
|
||||
do_convert_rgb: bool,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]],
|
||||
device: Optional[Union[str, torch.device]],
|
||||
disable_grouping: Optional[bool],
|
||||
return_tensors: Optional[Union[str, TensorType]],
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
|
||||
|
||||
Args:
|
||||
images (`ImageInput`):
|
||||
Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
|
||||
vision_info (`list[Dict]`, *optional*):
|
||||
Optional list of dictionaries containing additional information about vision inputs.
|
||||
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
||||
Whether to resize the image.
|
||||
size (`dict[str, int]`, *optional*, defaults to `self.size`):
|
||||
Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
|
||||
interpolation (`InterpolationMode`):
|
||||
Resampling filter to use if resizing the image.
|
||||
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
||||
Whether to rescale the image.
|
||||
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
||||
Scale factor to use if rescaling the image.
|
||||
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
||||
Whether to normalize the image.
|
||||
image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
|
||||
Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
|
||||
image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
|
||||
Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
|
||||
patch_size (`int`, *optional*, defaults to `self.patch_size`):
|
||||
The spatial patch size of the vision encoder.
|
||||
temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
|
||||
The temporal patch size of the vision encoder.
|
||||
merge_size (`int`, *optional*, defaults to `self.merge_size`):
|
||||
The merge size of the vision encoder to llm encoder.
|
||||
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
|
||||
Whether to convert the image to RGB.
|
||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||
The channel dimension format for the input image. Can be one of:
|
||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
||||
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
||||
device (`torch.device`, *optional*):
|
||||
The device to process the images on. If unset, the device is inferred from the input images.
|
||||
"""
|
||||
images = self._prepare_image_like_inputs(
|
||||
images=images,
|
||||
do_convert_rgb=do_convert_rgb,
|
||||
input_data_format=input_data_format,
|
||||
device=device,
|
||||
)
|
||||
|
||||
height, width = get_image_size(images[0], channel_dim=ChannelDimension.FIRST)
|
||||
resized_height, resized_width = height, width
|
||||
|
||||
# Group images by size for batched resizing
|
||||
grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
|
||||
resized_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
height, width = stacked_images.shape[-2:]
|
||||
if do_resize:
|
||||
resized_height, resized_width = smart_resize(
|
||||
height,
|
||||
@@ -215,203 +232,63 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
|
||||
# Needed in case do_resize is False, or resize returns images with different sizes
|
||||
grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
|
||||
processed_images_grouped = {}
|
||||
processed_grids = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
resized_height, resized_width = stacked_images.shape[-2:]
|
||||
# Fused rescale and normalize
|
||||
stacked_images = self.rescale_and_normalize(
|
||||
patches = self.rescale_and_normalize(
|
||||
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
||||
)
|
||||
processed_images_grouped[shape] = stacked_images
|
||||
if patches.ndim == 4:
|
||||
# add a temporal dimension if we have images
|
||||
patches = patches.unsqueeze(1)
|
||||
if patches.shape[1] % temporal_patch_size != 0:
|
||||
repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
|
||||
patches = torch.cat([patches, repeats], dim=1)
|
||||
batch_size, grid_t, channel = patches.shape[:3]
|
||||
grid_t = grid_t // temporal_patch_size
|
||||
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
|
||||
|
||||
patches = patches.view(
|
||||
batch_size,
|
||||
grid_t,
|
||||
temporal_patch_size,
|
||||
channel,
|
||||
grid_h // merge_size,
|
||||
merge_size,
|
||||
patch_size,
|
||||
grid_w // merge_size,
|
||||
merge_size,
|
||||
patch_size,
|
||||
)
|
||||
# Reorder dimensions to group grid and patch information for subsequent flattening.
|
||||
# (batch, grid_t, grid_h, grid_w, merge_h, merge_w, channel, temp_patch_size, patch_h, patch_w)
|
||||
patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
|
||||
flatten_patches = patches.reshape(
|
||||
batch_size,
|
||||
grid_t * grid_h * grid_w,
|
||||
channel * temporal_patch_size * patch_size * patch_size,
|
||||
)
|
||||
|
||||
processed_images_grouped[shape] = flatten_patches
|
||||
processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size
|
||||
|
||||
processed_images = reorder_images(processed_images_grouped, grouped_images_index)
|
||||
patches = torch.stack(processed_images, dim=0)
|
||||
if patches.shape[0] % temporal_patch_size != 0:
|
||||
repeats = patches[-1].unsqueeze(0).repeat(temporal_patch_size - 1, 1, 1, 1)
|
||||
patches = torch.cat([patches, repeats], dim=0)
|
||||
processed_grids = reorder_images(processed_grids, grouped_images_index)
|
||||
pixel_values = torch.cat(processed_images, dim=0)
|
||||
image_grid_thw = torch.tensor(processed_grids)
|
||||
|
||||
channel = patches.shape[1]
|
||||
grid_t = patches.shape[0] // temporal_patch_size
|
||||
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
|
||||
|
||||
patches = patches.view(
|
||||
grid_t,
|
||||
temporal_patch_size,
|
||||
channel,
|
||||
grid_h // merge_size,
|
||||
merge_size,
|
||||
patch_size,
|
||||
grid_w // merge_size,
|
||||
merge_size,
|
||||
patch_size,
|
||||
return BatchFeature(
|
||||
data={"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}, tensor_type=return_tensors
|
||||
)
|
||||
patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
|
||||
flatten_patches = patches.reshape(
|
||||
grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size
|
||||
)
|
||||
|
||||
return flatten_patches, (grid_t, grid_h, grid_w)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
videos: VideoInput = None,
|
||||
do_resize: Optional[bool] = None,
|
||||
size: Optional[dict[str, int]] = None,
|
||||
resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] = None,
|
||||
do_rescale: Optional[bool] = None,
|
||||
rescale_factor: Optional[float] = None,
|
||||
do_normalize: Optional[bool] = None,
|
||||
image_mean: Optional[Union[float, list[float]]] = None,
|
||||
image_std: Optional[Union[float, list[float]]] = None,
|
||||
min_pixels: Optional[int] = None,
|
||||
max_pixels: Optional[int] = None,
|
||||
patch_size: Optional[int] = None,
|
||||
temporal_patch_size: Optional[int] = None,
|
||||
merge_size: Optional[int] = None,
|
||||
do_convert_rgb: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
device: Optional["torch.device"] = None,
|
||||
disable_grouping: Optional[bool] = False,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
min_pixels (`int`, *optional*, defaults to `56 * 56`):
|
||||
The min pixels of the image to resize the image.
|
||||
max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
|
||||
The max pixels of the image to resize the image.
|
||||
patch_size (`int`, *optional*, defaults to 14):
|
||||
The spatial patch size of the vision encoder.
|
||||
temporal_patch_size (`int`, *optional*, defaults to 2):
|
||||
The temporal patch size of the vision encoder.
|
||||
merge_size (`int`, *optional*, defaults to 2):
|
||||
The merge size of the vision encoder to llm encoder.
|
||||
"""
|
||||
min_pixels = min_pixels if min_pixels is not None else self.min_pixels
|
||||
max_pixels = max_pixels if max_pixels is not None else self.max_pixels
|
||||
|
||||
if size is not None:
|
||||
if "shortest_edge" not in size or "longest_edge" not in size:
|
||||
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
||||
min_pixels = size["shortest_edge"]
|
||||
elif min_pixels is not None and max_pixels is not None:
|
||||
# backward compatibility: override size with min_pixels and max_pixels if they are provided
|
||||
size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
|
||||
else:
|
||||
size = {**self.size}
|
||||
|
||||
do_resize = do_resize if do_resize is not None else self.do_resize
|
||||
size = size if size is not None else self.size
|
||||
resample = resample if resample is not None else self.resample
|
||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
||||
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
||||
image_mean = image_mean if image_mean is not None else self.image_mean
|
||||
image_std = image_std if image_std is not None else self.image_std
|
||||
patch_size = patch_size if patch_size is not None else self.patch_size
|
||||
temporal_patch_size = temporal_patch_size if temporal_patch_size is not None else self.temporal_patch_size
|
||||
merge_size = merge_size if merge_size is not None else self.merge_size
|
||||
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
|
||||
|
||||
# Make hashable for cache
|
||||
size = SizeDict(**size) if size is not None else None
|
||||
image_mean = tuple(image_mean) if image_mean is not None else None
|
||||
image_std = tuple(image_std) if image_std is not None else None
|
||||
|
||||
self._validate_preprocess_kwargs(
|
||||
do_rescale=do_rescale,
|
||||
rescale_factor=rescale_factor,
|
||||
do_normalize=do_normalize,
|
||||
image_mean=image_mean,
|
||||
image_std=image_std,
|
||||
do_resize=do_resize,
|
||||
size=size,
|
||||
resample=resample,
|
||||
return_tensors=return_tensors,
|
||||
data_format=data_format,
|
||||
)
|
||||
interpolation = (
|
||||
pil_torch_interpolation_mapping[resample] if isinstance(resample, (PILImageResampling, int)) else resample
|
||||
)
|
||||
|
||||
if images is not None:
|
||||
images = make_flat_list_of_images(images)
|
||||
|
||||
if images is not None and not valid_images(images):
|
||||
raise ValueError(
|
||||
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
|
||||
"torch.Tensor, tf.Tensor or jax.ndarray."
|
||||
)
|
||||
|
||||
data = {}
|
||||
if images is not None:
|
||||
pixel_values, vision_grid_thws = [], []
|
||||
for image in images:
|
||||
patches, image_grid_thw = self._preprocess(
|
||||
image,
|
||||
do_resize=do_resize,
|
||||
size=size,
|
||||
interpolation=interpolation,
|
||||
do_rescale=do_rescale,
|
||||
rescale_factor=rescale_factor,
|
||||
do_normalize=do_normalize,
|
||||
image_mean=image_mean,
|
||||
image_std=image_std,
|
||||
patch_size=patch_size,
|
||||
temporal_patch_size=temporal_patch_size,
|
||||
merge_size=merge_size,
|
||||
do_convert_rgb=do_convert_rgb,
|
||||
input_data_format=input_data_format,
|
||||
device=device,
|
||||
disable_grouping=disable_grouping,
|
||||
)
|
||||
pixel_values.extend(patches)
|
||||
vision_grid_thws.append(image_grid_thw)
|
||||
pixel_values = torch.stack(pixel_values)
|
||||
vision_grid_thws = torch.tensor(vision_grid_thws)
|
||||
data.update({"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws})
|
||||
|
||||
# kept for BC only and should be removed after v5.0
|
||||
if videos is not None:
|
||||
logger.warning(
|
||||
"`Qwen2VLImageProcessorFast` works only with image inputs and doesn't process videos anymore. "
|
||||
"This is a deprecated behavior and will be removed in v5.0. "
|
||||
"Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
|
||||
)
|
||||
videos = make_batched_videos(videos)
|
||||
pixel_values_videos, vision_grid_thws_videos = [], []
|
||||
for images in videos:
|
||||
patches, video_grid_thw = self._preprocess(
|
||||
images,
|
||||
do_resize=do_resize,
|
||||
size=size,
|
||||
interpolation=interpolation,
|
||||
do_rescale=do_rescale,
|
||||
rescale_factor=rescale_factor,
|
||||
do_normalize=do_normalize,
|
||||
image_mean=image_mean,
|
||||
image_std=image_std,
|
||||
patch_size=patch_size,
|
||||
temporal_patch_size=temporal_patch_size,
|
||||
merge_size=merge_size,
|
||||
do_convert_rgb=do_convert_rgb,
|
||||
input_data_format=input_data_format,
|
||||
device=device,
|
||||
disable_grouping=disable_grouping,
|
||||
)
|
||||
pixel_values_videos.extend(patches)
|
||||
vision_grid_thws_videos.append(video_grid_thw)
|
||||
pixel_values_videos = torch.stack(pixel_values_videos)
|
||||
vision_grid_thws_videos = torch.tensor(vision_grid_thws_videos)
|
||||
data.update({"pixel_values_videos": pixel_values_videos, "video_grid_thw": vision_grid_thws_videos})
|
||||
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
|
||||
"""
|
||||
A utility that returns number of image patches for a given image size.
|
||||
|
||||
Note: Do not remove this method! It is used by vLLM to infer the number of patches and placeholders
|
||||
without an image input.
|
||||
|
||||
Args:
|
||||
height (`int`):
|
||||
Height of the input image.
|
||||
|
||||
@@ -116,8 +116,21 @@ class Qwen2VLVideoProcessor(BaseVideoProcessor):
|
||||
model_input_names = ["pixel_values_videos", "video_grid_thw"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[Qwen2VLVideoProcessorInitKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
self.size = {"shortest_edge": self.min_pixels, "longest_edge": self.max_pixels}
|
||||
size = kwargs.pop("size", None)
|
||||
min_pixels = kwargs.pop("min_pixels", None)
|
||||
max_pixels = kwargs.pop("max_pixels", None)
|
||||
# backward compatibility: override size with min_pixels and max_pixels if they are provided
|
||||
size = self.size if size is None else size
|
||||
if min_pixels is not None:
|
||||
size["shortest_edge"] = min_pixels
|
||||
size.pop("min_pixels", None)
|
||||
if max_pixels is not None:
|
||||
size["longest_edge"] = max_pixels
|
||||
size.pop("max_pixels", None)
|
||||
if "shortest_edge" not in size or "longest_edge" not in size:
|
||||
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
||||
|
||||
super().__init__(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)
|
||||
|
||||
def sample_frames(
|
||||
self,
|
||||
|
||||
@@ -25,10 +25,17 @@ from huggingface_hub import hf_hub_download
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
Qwen2_5OmniProcessor,
|
||||
Qwen2Tokenizer,
|
||||
Qwen2TokenizerFast,
|
||||
WhisperFeatureExtractor,
|
||||
)
|
||||
from transformers.testing_utils import require_av, require_librosa, require_torch, require_torchaudio, require_vision
|
||||
from transformers.testing_utils import (
|
||||
require_av,
|
||||
require_librosa,
|
||||
require_torch,
|
||||
require_torchaudio,
|
||||
require_torchvision,
|
||||
require_vision,
|
||||
)
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
@@ -38,12 +45,13 @@ if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import Qwen2VLImageProcessor
|
||||
from transformers import Qwen2VLImageProcessorFast
|
||||
|
||||
|
||||
@require_vision
|
||||
@require_torch
|
||||
@require_torchaudio
|
||||
@require_torchvision
|
||||
class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_class = Qwen2_5OmniProcessor
|
||||
|
||||
@@ -244,13 +252,13 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
)
|
||||
|
||||
processor.save_pretrained(self.tmpdirname)
|
||||
processor = Qwen2_5OmniProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
||||
processor = Qwen2_5OmniProcessor.from_pretrained(self.tmpdirname, use_fast=True)
|
||||
|
||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
||||
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
|
||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
||||
self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
|
||||
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
|
||||
self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast)
|
||||
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast)
|
||||
self.assertIsInstance(processor.feature_extractor, WhisperFeatureExtractor)
|
||||
|
||||
def test_image_processor(self):
|
||||
@@ -267,8 +275,8 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
input_image_proc = image_processor(image_input, return_tensors="np")
|
||||
input_processor = processor(images=image_input, text="dummy", return_tensors="np")
|
||||
input_image_proc = image_processor(image_input, return_tensors="pt")
|
||||
input_processor = processor(images=image_input, text="dummy", return_tensors="pt")
|
||||
|
||||
for key in input_image_proc.keys():
|
||||
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||
|
||||
@@ -20,15 +20,15 @@ import unittest
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from transformers import AutoProcessor, Qwen2Tokenizer
|
||||
from transformers.testing_utils import require_av, require_torch, require_vision
|
||||
from transformers import AutoProcessor, Qwen2TokenizerFast
|
||||
from transformers.testing_utils import require_av, require_torch, require_torchvision, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import Qwen2_5_VLProcessor, Qwen2VLImageProcessor
|
||||
from transformers import Qwen2_5_VLProcessor, Qwen2VLImageProcessorFast
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
@@ -36,6 +36,7 @@ if is_torch_available():
|
||||
|
||||
@require_vision
|
||||
@require_torch
|
||||
@require_torchvision
|
||||
class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_class = Qwen2_5_VLProcessor
|
||||
|
||||
@@ -73,12 +74,12 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
|
||||
)
|
||||
processor.save_pretrained(self.tmpdirname)
|
||||
processor = Qwen2_5_VLProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
||||
processor = Qwen2_5_VLProcessor.from_pretrained(self.tmpdirname, use_fast=True)
|
||||
|
||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
||||
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
|
||||
self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
|
||||
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
|
||||
self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast)
|
||||
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast)
|
||||
|
||||
def test_image_processor(self):
|
||||
image_processor = self.get_image_processor()
|
||||
@@ -91,8 +92,8 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
input_image_proc = image_processor(image_input, return_tensors="np")
|
||||
input_processor = processor(images=image_input, text="dummy", return_tensors="np")
|
||||
input_image_proc = image_processor(image_input, return_tensors="pt")
|
||||
input_processor = processor(images=image_input, text="dummy", return_tensors="pt")
|
||||
|
||||
for key in input_image_proc.keys():
|
||||
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||
|
||||
@@ -22,7 +22,7 @@ import requests
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs
|
||||
|
||||
@@ -35,8 +35,8 @@ if is_vision_available():
|
||||
|
||||
from transformers import Qwen2VLImageProcessor
|
||||
|
||||
# if is_torchvision_available():
|
||||
# from transformers import Qwen2VLImageProcessorFast
|
||||
if is_torchvision_available():
|
||||
from transformers import Qwen2VLImageProcessorFast
|
||||
|
||||
|
||||
class Qwen2VLImageProcessingTester:
|
||||
@@ -119,7 +119,7 @@ class Qwen2VLImageProcessingTester:
|
||||
@require_vision
|
||||
class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = Qwen2VLImageProcessor if is_vision_available() else None
|
||||
# fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None
|
||||
fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
@@ -363,3 +363,34 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
|
||||
|
||||
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
|
||||
self.assertEqual(encoding_slow.image_grid_thw.dtype, encoding_fast.image_grid_thw.dtype)
|
||||
self._assert_slow_fast_tensors_equivalence(
|
||||
encoding_slow.image_grid_thw.float(), encoding_fast.image_grid_thw.float()
|
||||
)
|
||||
|
||||
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
    """Check that the slow and fast processors agree on a batch of differently sized images."""
    both_variants_enabled = self.test_slow_image_processor and self.test_fast_image_processor
    if not both_variants_enabled:
        self.skipTest(reason="Skipping slow/fast equivalence test")

    if self.image_processing_class is None or self.fast_image_processing_class is None:
        self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

    tester = self.image_processor_tester
    if getattr(tester, "do_center_crop", False):
        self.skipTest(
            reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
        )

    # Unequal resolutions, so the batch contains images of more than one shape.
    batch = tester.prepare_image_inputs(equal_resolution=False, torchify=True)
    slow_processor = self.image_processing_class(**self.image_processor_dict)
    fast_processor = self.fast_image_processing_class(**self.image_processor_dict)

    slow_out = slow_processor(batch, return_tensors="pt")
    fast_out = fast_processor(batch, return_tensors="pt")

    self._assert_slow_fast_tensors_equivalence(slow_out.pixel_values, fast_out.pixel_values)

    # The grids must match in dtype as well as value; they are cast to float for the comparison helper.
    slow_grid = slow_out.image_grid_thw
    fast_grid = fast_out.image_grid_thw
    self.assertEqual(slow_grid.dtype, fast_grid.dtype)
    self._assert_slow_fast_tensors_equivalence(slow_grid.float(), fast_grid.float())
|
||||
|
||||
@@ -20,18 +20,18 @@ import unittest
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from transformers import AutoProcessor, Qwen2Tokenizer
|
||||
from transformers.testing_utils import require_av, require_torch, require_vision
|
||||
from transformers import AutoProcessor, Qwen2TokenizerFast
|
||||
from transformers.testing_utils import require_av, require_torch, require_torchvision, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import Qwen2VLImageProcessor, Qwen2VLProcessor
|
||||
from transformers import Qwen2VLProcessor
|
||||
|
||||
if is_torchvision_available():
|
||||
from transformers import Qwen2VLVideoProcessor
|
||||
from transformers import Qwen2VLImageProcessorFast, Qwen2VLVideoProcessor
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
@@ -39,6 +39,7 @@ if is_torch_available():
|
||||
|
||||
@require_vision
|
||||
@require_torch
|
||||
@require_torchvision
|
||||
class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_class = Qwen2VLProcessor
|
||||
|
||||
@@ -76,12 +77,12 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
|
||||
)
|
||||
processor.save_pretrained(self.tmpdirname)
|
||||
processor = Qwen2VLProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
||||
processor = Qwen2VLProcessor.from_pretrained(self.tmpdirname, use_fast=True)
|
||||
|
||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
||||
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
|
||||
self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
|
||||
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
|
||||
self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast)
|
||||
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast)
|
||||
self.assertIsInstance(processor.video_processor, Qwen2VLVideoProcessor)
|
||||
|
||||
def test_image_processor(self):
|
||||
@@ -95,8 +96,8 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
input_image_proc = image_processor(image_input, return_tensors="np")
|
||||
input_processor = processor(images=image_input, text="dummy", return_tensors="np")
|
||||
input_image_proc = image_processor(image_input, return_tensors="pt")
|
||||
input_processor = processor(images=image_input, text="dummy", return_tensors="pt")
|
||||
|
||||
for key in input_image_proc.keys():
|
||||
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||
|
||||
@@ -937,7 +937,7 @@ class ProcessorTesterMixin:
|
||||
"video", batch_size, return_tensors, "videos_input_name", "video_processor", MODALITY_INPUT_DATA["videos"]
|
||||
)
|
||||
|
||||
@parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
|
||||
@parameterized.expand([(1, "pt"), (2, "pt")]) # fast image processors only support torch tensors (return_tensors="pt")
|
||||
def test_apply_chat_template_image(self, batch_size: int, return_tensors: str):
|
||||
self._test_apply_chat_template(
|
||||
"image", batch_size, return_tensors, "images_input_name", "image_processor", MODALITY_INPUT_DATA["images"]
|
||||
|
||||
Reference in New Issue
Block a user