🚨[Fast Image Processor] Force Fast Image Processor for Qwen2_VL/2_5_VL + Refactor (#39591)
* init
* Force qwen2VL image proc to fast
* refactor qwen2 vl fast
* fix copies
* Update after PR review and update tests to use return_tensors="pt"
* fix processor tests
* add BC for min pixels/max pixels
This commit is contained in:
@@ -49,6 +49,9 @@ from .configuration_auto import (
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
FORCE_FAST_IMAGE_PROCESSOR = ["Qwen2VLImageProcessor"]
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
# This significantly improves completion suggestion performance when
|
# This significantly improves completion suggestion performance when
|
||||||
# the transformers package is used with Microsoft's Pylance language server.
|
# the transformers package is used with Microsoft's Pylance language server.
|
||||||
@@ -514,6 +517,13 @@ class AutoImageProcessor:
|
|||||||
# if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor.
|
# if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor.
|
||||||
if use_fast is None:
|
if use_fast is None:
|
||||||
use_fast = image_processor_type.endswith("Fast")
|
use_fast = image_processor_type.endswith("Fast")
|
||||||
|
if not use_fast and image_processor_type in FORCE_FAST_IMAGE_PROCESSOR and is_torchvision_available():
|
||||||
|
use_fast = True
|
||||||
|
logger.warning_once(
|
||||||
|
f"The image processor of type `{image_processor_type}` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. "
|
||||||
|
"This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. "
|
||||||
|
"Note that this behavior will be extended to all models in a future release."
|
||||||
|
)
|
||||||
if not use_fast:
|
if not use_fast:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
|
"Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ class ColQwen2Processor(ColPaliProcessor):
|
|||||||
query_prefix (`str`, *optional*): A prefix to be used for the query.
|
query_prefix (`str`, *optional*): A prefix to be used for the query.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
image_processor_class = "Qwen2VLImageProcessor"
|
image_processor_class = "AutoImageProcessor"
|
||||||
tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
|
tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ class ColQwen2Processor(ProcessorMixin):
|
|||||||
|
|
||||||
attributes = ["image_processor", "tokenizer"]
|
attributes = ["image_processor", "tokenizer"]
|
||||||
|
|
||||||
image_processor_class = "Qwen2VLImageProcessor"
|
image_processor_class = "AutoImageProcessor"
|
||||||
tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
|
tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|||||||
@@ -138,6 +138,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
|||||||
processed_images_grouped = {}
|
processed_images_grouped = {}
|
||||||
processed_grids = {}
|
processed_grids = {}
|
||||||
for shape, stacked_images in grouped_images.items():
|
for shape, stacked_images in grouped_images.items():
|
||||||
|
resized_height, resized_width = stacked_images.shape[-2:]
|
||||||
# Fused rescale and normalize
|
# Fused rescale and normalize
|
||||||
stacked_images = self.rescale_and_normalize(
|
stacked_images = self.rescale_and_normalize(
|
||||||
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
||||||
@@ -188,9 +189,6 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
|
|||||||
images: ImageInput,
|
images: ImageInput,
|
||||||
**kwargs: Unpack[Glm4vFastImageProcessorKwargs],
|
**kwargs: Unpack[Glm4vFastImageProcessorKwargs],
|
||||||
) -> BatchFeature:
|
) -> BatchFeature:
|
||||||
"""
|
|
||||||
Preprocess an image or batch of images.
|
|
||||||
"""
|
|
||||||
return super().preprocess(images, **kwargs)
|
return super().preprocess(images, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -35,9 +35,6 @@ from ...image_utils import (
|
|||||||
ImageInput,
|
ImageInput,
|
||||||
PILImageResampling,
|
PILImageResampling,
|
||||||
SizeDict,
|
SizeDict,
|
||||||
get_image_size,
|
|
||||||
make_flat_list_of_images,
|
|
||||||
valid_images,
|
|
||||||
)
|
)
|
||||||
from ...processing_utils import Unpack
|
from ...processing_utils import Unpack
|
||||||
from ...utils import (
|
from ...utils import (
|
||||||
@@ -57,8 +54,6 @@ if is_torch_available():
|
|||||||
|
|
||||||
|
|
||||||
if is_torchvision_available():
|
if is_torchvision_available():
|
||||||
from ...image_utils import pil_torch_interpolation_mapping
|
|
||||||
|
|
||||||
if is_torchvision_v2_available():
|
if is_torchvision_v2_available():
|
||||||
from torchvision.transforms.v2 import functional as F
|
from torchvision.transforms.v2 import functional as F
|
||||||
else:
|
else:
|
||||||
@@ -110,18 +105,90 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
|
|||||||
size = kwargs.pop("size", None)
|
size = kwargs.pop("size", None)
|
||||||
min_pixels = kwargs.pop("min_pixels", None)
|
min_pixels = kwargs.pop("min_pixels", None)
|
||||||
max_pixels = kwargs.pop("max_pixels", None)
|
max_pixels = kwargs.pop("max_pixels", None)
|
||||||
if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
|
|
||||||
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
|
||||||
else:
|
|
||||||
size = self.size
|
|
||||||
# backward compatibility: override size with min_pixels and max_pixels if they are provided
|
# backward compatibility: override size with min_pixels and max_pixels if they are provided
|
||||||
|
size = self.size if size is None else size
|
||||||
if min_pixels is not None:
|
if min_pixels is not None:
|
||||||
size["shortest_edge"] = min_pixels
|
size["shortest_edge"] = min_pixels
|
||||||
|
size.pop("min_pixels", None)
|
||||||
if max_pixels is not None:
|
if max_pixels is not None:
|
||||||
size["longest_edge"] = max_pixels
|
size["longest_edge"] = max_pixels
|
||||||
|
size.pop("max_pixels", None)
|
||||||
|
if "shortest_edge" not in size or "longest_edge" not in size:
|
||||||
|
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
||||||
|
|
||||||
super().__init__(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)
|
super().__init__(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)
|
||||||
|
|
||||||
|
def _further_process_kwargs(
|
||||||
|
self,
|
||||||
|
size: Optional[SizeDict] = None,
|
||||||
|
min_pixels: Optional[int] = None,
|
||||||
|
max_pixels: Optional[int] = None,
|
||||||
|
**kwargs,
|
||||||
|
) -> dict:
|
||||||
|
"""
|
||||||
|
Update kwargs that need further processing before being validated
|
||||||
|
Can be overridden by subclasses to customize the processing of kwargs.
|
||||||
|
"""
|
||||||
|
if min_pixels is not None and max_pixels is not None:
|
||||||
|
size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
|
||||||
|
elif size is not None:
|
||||||
|
if "shortest_edge" not in size or "longest_edge" not in size:
|
||||||
|
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
||||||
|
min_pixels = size["shortest_edge"]
|
||||||
|
max_pixels = size["longest_edge"]
|
||||||
|
else:
|
||||||
|
size = {**self.size}
|
||||||
|
|
||||||
|
return super()._further_process_kwargs(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)
|
||||||
|
|
||||||
|
@auto_docstring
|
||||||
|
def preprocess(
|
||||||
|
self,
|
||||||
|
images: ImageInput,
|
||||||
|
videos: Optional[VideoInput] = None,
|
||||||
|
**kwargs: Unpack[Qwen2VLFastImageProcessorKwargs],
|
||||||
|
) -> BatchFeature:
|
||||||
|
return super().preprocess(images, videos, **kwargs)
|
||||||
|
|
||||||
|
def _preprocess_image_like_inputs(
|
||||||
|
self,
|
||||||
|
images: ImageInput,
|
||||||
|
videos: VideoInput,
|
||||||
|
do_convert_rgb: bool,
|
||||||
|
input_data_format: ChannelDimension,
|
||||||
|
device: Optional[Union[str, "torch.device"]] = None,
|
||||||
|
**kwargs: Unpack[DefaultFastImageProcessorKwargs],
|
||||||
|
) -> BatchFeature:
|
||||||
|
"""
|
||||||
|
Preprocess image-like inputs.
|
||||||
|
To be overridden by subclasses when image-like inputs other than images should be processed.
|
||||||
|
It can be used for segmentation maps, depth maps, etc.
|
||||||
|
"""
|
||||||
|
# Prepare input images
|
||||||
|
batch_feature = BatchFeature()
|
||||||
|
if images is not None:
|
||||||
|
images = self._prepare_image_like_inputs(
|
||||||
|
images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
|
||||||
|
)
|
||||||
|
batch_feature = self._preprocess(images, **kwargs)
|
||||||
|
if videos is not None:
|
||||||
|
logger.warning(
|
||||||
|
"`Qwen2VLImageProcessorFast` works only with image inputs and doesn't process videos anymore. "
|
||||||
|
"This is a deprecated behavior and will be removed in v5.0. "
|
||||||
|
"Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
|
||||||
|
)
|
||||||
|
# Can't change _prepare_images_structure to work with videos because it also needs to work with images.
|
||||||
|
videos = make_batched_videos(videos)
|
||||||
|
videos = [
|
||||||
|
torch.stack(self._prepare_image_like_inputs(video, do_convert_rgb, input_data_format, device))
|
||||||
|
for video in videos
|
||||||
|
]
|
||||||
|
video_outputs = self._preprocess(videos, **kwargs)
|
||||||
|
batch_feature.update(
|
||||||
|
{"pixel_values_videos": video_outputs.pixel_values, "video_grid_thw": video_outputs.image_grid_thw}
|
||||||
|
)
|
||||||
|
return batch_feature
|
||||||
|
|
||||||
def _preprocess(
|
def _preprocess(
|
||||||
self,
|
self,
|
||||||
images: list["torch.Tensor"],
|
images: list["torch.Tensor"],
|
||||||
@@ -136,65 +203,15 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
|
|||||||
patch_size: int,
|
patch_size: int,
|
||||||
temporal_patch_size: int,
|
temporal_patch_size: int,
|
||||||
merge_size: int,
|
merge_size: int,
|
||||||
do_convert_rgb: bool,
|
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]],
|
|
||||||
device: Optional[Union[str, torch.device]],
|
|
||||||
disable_grouping: Optional[bool],
|
disable_grouping: Optional[bool],
|
||||||
|
return_tensors: Optional[Union[str, TensorType]],
|
||||||
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""
|
|
||||||
Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
images (`ImageInput`):
|
|
||||||
Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
|
|
||||||
vision_info (`list[Dict]`, *optional*):
|
|
||||||
Optional list of dictionaries containing additional information about vision inputs.
|
|
||||||
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
|
||||||
Whether to resize the image.
|
|
||||||
size (`dict[str, int]`, *optional*, defaults to `self.size`):
|
|
||||||
Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
|
|
||||||
interpolation (`InterpolationMode`):
|
|
||||||
Resampling filter to use if resizing the image.
|
|
||||||
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
|
||||||
Whether to rescale the image.
|
|
||||||
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
|
||||||
Scale factor to use if rescaling the image.
|
|
||||||
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
|
||||||
Whether to normalize the image.
|
|
||||||
image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
|
|
||||||
Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
|
|
||||||
image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
|
|
||||||
Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
|
|
||||||
patch_size (`int`, *optional*, defaults to `self.patch_size`):
|
|
||||||
The spatial patch size of the vision encoder.
|
|
||||||
temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
|
|
||||||
The temporal patch size of the vision encoder.
|
|
||||||
merge_size (`int`, *optional*, defaults to `self.merge_size`):
|
|
||||||
The merge size of the vision encoder to llm encoder.
|
|
||||||
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
|
|
||||||
Whether to convert the image to RGB.
|
|
||||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
|
||||||
The channel dimension format for the input image. Can be one of:
|
|
||||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
|
||||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
|
||||||
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
|
||||||
device (`torch.device`, *optional*):
|
|
||||||
The device to process the images on. If unset, the device is inferred from the input images.
|
|
||||||
"""
|
|
||||||
images = self._prepare_image_like_inputs(
|
|
||||||
images=images,
|
|
||||||
do_convert_rgb=do_convert_rgb,
|
|
||||||
input_data_format=input_data_format,
|
|
||||||
device=device,
|
|
||||||
)
|
|
||||||
|
|
||||||
height, width = get_image_size(images[0], channel_dim=ChannelDimension.FIRST)
|
|
||||||
resized_height, resized_width = height, width
|
|
||||||
|
|
||||||
# Group images by size for batched resizing
|
# Group images by size for batched resizing
|
||||||
grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
|
grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
|
||||||
resized_images_grouped = {}
|
resized_images_grouped = {}
|
||||||
for shape, stacked_images in grouped_images.items():
|
for shape, stacked_images in grouped_images.items():
|
||||||
|
height, width = stacked_images.shape[-2:]
|
||||||
if do_resize:
|
if do_resize:
|
||||||
resized_height, resized_width = smart_resize(
|
resized_height, resized_width = smart_resize(
|
||||||
height,
|
height,
|
||||||
@@ -215,24 +232,25 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
|
|||||||
# Needed in case do_resize is False, or resize returns images with different sizes
|
# Needed in case do_resize is False, or resize returns images with different sizes
|
||||||
grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
|
grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
|
||||||
processed_images_grouped = {}
|
processed_images_grouped = {}
|
||||||
|
processed_grids = {}
|
||||||
for shape, stacked_images in grouped_images.items():
|
for shape, stacked_images in grouped_images.items():
|
||||||
|
resized_height, resized_width = stacked_images.shape[-2:]
|
||||||
# Fused rescale and normalize
|
# Fused rescale and normalize
|
||||||
stacked_images = self.rescale_and_normalize(
|
patches = self.rescale_and_normalize(
|
||||||
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
||||||
)
|
)
|
||||||
processed_images_grouped[shape] = stacked_images
|
if patches.ndim == 4:
|
||||||
|
# add a temporal dimension if we have images
|
||||||
processed_images = reorder_images(processed_images_grouped, grouped_images_index)
|
patches = patches.unsqueeze(1)
|
||||||
patches = torch.stack(processed_images, dim=0)
|
if patches.shape[1] % temporal_patch_size != 0:
|
||||||
if patches.shape[0] % temporal_patch_size != 0:
|
repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
|
||||||
repeats = patches[-1].unsqueeze(0).repeat(temporal_patch_size - 1, 1, 1, 1)
|
patches = torch.cat([patches, repeats], dim=1)
|
||||||
patches = torch.cat([patches, repeats], dim=0)
|
batch_size, grid_t, channel = patches.shape[:3]
|
||||||
|
grid_t = grid_t // temporal_patch_size
|
||||||
channel = patches.shape[1]
|
|
||||||
grid_t = patches.shape[0] // temporal_patch_size
|
|
||||||
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
|
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
|
||||||
|
|
||||||
patches = patches.view(
|
patches = patches.view(
|
||||||
|
batch_size,
|
||||||
grid_t,
|
grid_t,
|
||||||
temporal_patch_size,
|
temporal_patch_size,
|
||||||
channel,
|
channel,
|
||||||
@@ -243,175 +261,34 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
|
|||||||
merge_size,
|
merge_size,
|
||||||
patch_size,
|
patch_size,
|
||||||
)
|
)
|
||||||
patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
|
# Reorder dimensions to group grid and patch information for subsequent flattening.
|
||||||
|
# (batch, grid_t, grid_h, grid_w, merge_h, merge_w, channel, temp_patch_size, patch_h, patch_w)
|
||||||
|
patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
|
||||||
flatten_patches = patches.reshape(
|
flatten_patches = patches.reshape(
|
||||||
grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size
|
batch_size,
|
||||||
|
grid_t * grid_h * grid_w,
|
||||||
|
channel * temporal_patch_size * patch_size * patch_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
return flatten_patches, (grid_t, grid_h, grid_w)
|
processed_images_grouped[shape] = flatten_patches
|
||||||
|
processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size
|
||||||
|
|
||||||
@auto_docstring
|
processed_images = reorder_images(processed_images_grouped, grouped_images_index)
|
||||||
def preprocess(
|
processed_grids = reorder_images(processed_grids, grouped_images_index)
|
||||||
self,
|
pixel_values = torch.cat(processed_images, dim=0)
|
||||||
images: ImageInput,
|
image_grid_thw = torch.tensor(processed_grids)
|
||||||
videos: VideoInput = None,
|
|
||||||
do_resize: Optional[bool] = None,
|
|
||||||
size: Optional[dict[str, int]] = None,
|
|
||||||
resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] = None,
|
|
||||||
do_rescale: Optional[bool] = None,
|
|
||||||
rescale_factor: Optional[float] = None,
|
|
||||||
do_normalize: Optional[bool] = None,
|
|
||||||
image_mean: Optional[Union[float, list[float]]] = None,
|
|
||||||
image_std: Optional[Union[float, list[float]]] = None,
|
|
||||||
min_pixels: Optional[int] = None,
|
|
||||||
max_pixels: Optional[int] = None,
|
|
||||||
patch_size: Optional[int] = None,
|
|
||||||
temporal_patch_size: Optional[int] = None,
|
|
||||||
merge_size: Optional[int] = None,
|
|
||||||
do_convert_rgb: Optional[bool] = None,
|
|
||||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
|
||||||
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
|
||||||
device: Optional["torch.device"] = None,
|
|
||||||
disable_grouping: Optional[bool] = False,
|
|
||||||
**kwargs,
|
|
||||||
):
|
|
||||||
r"""
|
|
||||||
min_pixels (`int`, *optional*, defaults to `56 * 56`):
|
|
||||||
The min pixels of the image to resize the image.
|
|
||||||
max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
|
|
||||||
The max pixels of the image to resize the image.
|
|
||||||
patch_size (`int`, *optional*, defaults to 14):
|
|
||||||
The spatial patch size of the vision encoder.
|
|
||||||
temporal_patch_size (`int`, *optional*, defaults to 2):
|
|
||||||
The temporal patch size of the vision encoder.
|
|
||||||
merge_size (`int`, *optional*, defaults to 2):
|
|
||||||
The merge size of the vision encoder to llm encoder.
|
|
||||||
"""
|
|
||||||
min_pixels = min_pixels if min_pixels is not None else self.min_pixels
|
|
||||||
max_pixels = max_pixels if max_pixels is not None else self.max_pixels
|
|
||||||
|
|
||||||
if size is not None:
|
return BatchFeature(
|
||||||
if "shortest_edge" not in size or "longest_edge" not in size:
|
data={"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}, tensor_type=return_tensors
|
||||||
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
|
||||||
min_pixels = size["shortest_edge"]
|
|
||||||
elif min_pixels is not None and max_pixels is not None:
|
|
||||||
# backward compatibility: override size with min_pixels and max_pixels if they are provided
|
|
||||||
size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
|
|
||||||
else:
|
|
||||||
size = {**self.size}
|
|
||||||
|
|
||||||
do_resize = do_resize if do_resize is not None else self.do_resize
|
|
||||||
size = size if size is not None else self.size
|
|
||||||
resample = resample if resample is not None else self.resample
|
|
||||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
|
||||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
|
||||||
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
|
||||||
image_mean = image_mean if image_mean is not None else self.image_mean
|
|
||||||
image_std = image_std if image_std is not None else self.image_std
|
|
||||||
patch_size = patch_size if patch_size is not None else self.patch_size
|
|
||||||
temporal_patch_size = temporal_patch_size if temporal_patch_size is not None else self.temporal_patch_size
|
|
||||||
merge_size = merge_size if merge_size is not None else self.merge_size
|
|
||||||
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
|
|
||||||
|
|
||||||
# Make hashable for cache
|
|
||||||
size = SizeDict(**size) if size is not None else None
|
|
||||||
image_mean = tuple(image_mean) if image_mean is not None else None
|
|
||||||
image_std = tuple(image_std) if image_std is not None else None
|
|
||||||
|
|
||||||
self._validate_preprocess_kwargs(
|
|
||||||
do_rescale=do_rescale,
|
|
||||||
rescale_factor=rescale_factor,
|
|
||||||
do_normalize=do_normalize,
|
|
||||||
image_mean=image_mean,
|
|
||||||
image_std=image_std,
|
|
||||||
do_resize=do_resize,
|
|
||||||
size=size,
|
|
||||||
resample=resample,
|
|
||||||
return_tensors=return_tensors,
|
|
||||||
data_format=data_format,
|
|
||||||
)
|
)
|
||||||
interpolation = (
|
|
||||||
pil_torch_interpolation_mapping[resample] if isinstance(resample, (PILImageResampling, int)) else resample
|
|
||||||
)
|
|
||||||
|
|
||||||
if images is not None:
|
|
||||||
images = make_flat_list_of_images(images)
|
|
||||||
|
|
||||||
if images is not None and not valid_images(images):
|
|
||||||
raise ValueError(
|
|
||||||
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
|
|
||||||
"torch.Tensor, tf.Tensor or jax.ndarray."
|
|
||||||
)
|
|
||||||
|
|
||||||
data = {}
|
|
||||||
if images is not None:
|
|
||||||
pixel_values, vision_grid_thws = [], []
|
|
||||||
for image in images:
|
|
||||||
patches, image_grid_thw = self._preprocess(
|
|
||||||
image,
|
|
||||||
do_resize=do_resize,
|
|
||||||
size=size,
|
|
||||||
interpolation=interpolation,
|
|
||||||
do_rescale=do_rescale,
|
|
||||||
rescale_factor=rescale_factor,
|
|
||||||
do_normalize=do_normalize,
|
|
||||||
image_mean=image_mean,
|
|
||||||
image_std=image_std,
|
|
||||||
patch_size=patch_size,
|
|
||||||
temporal_patch_size=temporal_patch_size,
|
|
||||||
merge_size=merge_size,
|
|
||||||
do_convert_rgb=do_convert_rgb,
|
|
||||||
input_data_format=input_data_format,
|
|
||||||
device=device,
|
|
||||||
disable_grouping=disable_grouping,
|
|
||||||
)
|
|
||||||
pixel_values.extend(patches)
|
|
||||||
vision_grid_thws.append(image_grid_thw)
|
|
||||||
pixel_values = torch.stack(pixel_values)
|
|
||||||
vision_grid_thws = torch.tensor(vision_grid_thws)
|
|
||||||
data.update({"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws})
|
|
||||||
|
|
||||||
# kept for BC only and should be removed after v5.0
|
|
||||||
if videos is not None:
|
|
||||||
logger.warning(
|
|
||||||
"`Qwen2VLImageProcessorFast` works only with image inputs and doesn't process videos anymore. "
|
|
||||||
"This is a deprecated behavior and will be removed in v5.0. "
|
|
||||||
"Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
|
|
||||||
)
|
|
||||||
videos = make_batched_videos(videos)
|
|
||||||
pixel_values_videos, vision_grid_thws_videos = [], []
|
|
||||||
for images in videos:
|
|
||||||
patches, video_grid_thw = self._preprocess(
|
|
||||||
images,
|
|
||||||
do_resize=do_resize,
|
|
||||||
size=size,
|
|
||||||
interpolation=interpolation,
|
|
||||||
do_rescale=do_rescale,
|
|
||||||
rescale_factor=rescale_factor,
|
|
||||||
do_normalize=do_normalize,
|
|
||||||
image_mean=image_mean,
|
|
||||||
image_std=image_std,
|
|
||||||
patch_size=patch_size,
|
|
||||||
temporal_patch_size=temporal_patch_size,
|
|
||||||
merge_size=merge_size,
|
|
||||||
do_convert_rgb=do_convert_rgb,
|
|
||||||
input_data_format=input_data_format,
|
|
||||||
device=device,
|
|
||||||
disable_grouping=disable_grouping,
|
|
||||||
)
|
|
||||||
pixel_values_videos.extend(patches)
|
|
||||||
vision_grid_thws_videos.append(video_grid_thw)
|
|
||||||
pixel_values_videos = torch.stack(pixel_values_videos)
|
|
||||||
vision_grid_thws_videos = torch.tensor(vision_grid_thws_videos)
|
|
||||||
data.update({"pixel_values_videos": pixel_values_videos, "video_grid_thw": vision_grid_thws_videos})
|
|
||||||
|
|
||||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
|
||||||
|
|
||||||
def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
|
def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
|
||||||
"""
|
"""
|
||||||
A utility that returns number of image patches for a given image size.
|
A utility that returns number of image patches for a given image size.
|
||||||
|
|
||||||
|
Note: Do not remove this method! It is used by vLLM to infer the number of patches and placeholders
|
||||||
|
without an image input.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
height (`int`):
|
height (`int`):
|
||||||
Height of the input image.
|
Height of the input image.
|
||||||
|
|||||||
@@ -116,8 +116,21 @@ class Qwen2VLVideoProcessor(BaseVideoProcessor):
|
|||||||
model_input_names = ["pixel_values_videos", "video_grid_thw"]
|
model_input_names = ["pixel_values_videos", "video_grid_thw"]
|
||||||
|
|
||||||
def __init__(self, **kwargs: Unpack[Qwen2VLVideoProcessorInitKwargs]):
|
def __init__(self, **kwargs: Unpack[Qwen2VLVideoProcessorInitKwargs]):
|
||||||
super().__init__(**kwargs)
|
size = kwargs.pop("size", None)
|
||||||
self.size = {"shortest_edge": self.min_pixels, "longest_edge": self.max_pixels}
|
min_pixels = kwargs.pop("min_pixels", None)
|
||||||
|
max_pixels = kwargs.pop("max_pixels", None)
|
||||||
|
# backward compatibility: override size with min_pixels and max_pixels if they are provided
|
||||||
|
size = self.size if size is None else size
|
||||||
|
if min_pixels is not None:
|
||||||
|
size["shortest_edge"] = min_pixels
|
||||||
|
size.pop("min_pixels", None)
|
||||||
|
if max_pixels is not None:
|
||||||
|
size["longest_edge"] = max_pixels
|
||||||
|
size.pop("max_pixels", None)
|
||||||
|
if "shortest_edge" not in size or "longest_edge" not in size:
|
||||||
|
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
||||||
|
|
||||||
|
super().__init__(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)
|
||||||
|
|
||||||
def sample_frames(
|
def sample_frames(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -25,10 +25,17 @@ from huggingface_hub import hf_hub_download
|
|||||||
from transformers import (
|
from transformers import (
|
||||||
AutoProcessor,
|
AutoProcessor,
|
||||||
Qwen2_5OmniProcessor,
|
Qwen2_5OmniProcessor,
|
||||||
Qwen2Tokenizer,
|
Qwen2TokenizerFast,
|
||||||
WhisperFeatureExtractor,
|
WhisperFeatureExtractor,
|
||||||
)
|
)
|
||||||
from transformers.testing_utils import require_av, require_librosa, require_torch, require_torchaudio, require_vision
|
from transformers.testing_utils import (
|
||||||
|
require_av,
|
||||||
|
require_librosa,
|
||||||
|
require_torch,
|
||||||
|
require_torchaudio,
|
||||||
|
require_torchvision,
|
||||||
|
require_vision,
|
||||||
|
)
|
||||||
from transformers.utils import is_torch_available, is_vision_available
|
from transformers.utils import is_torch_available, is_vision_available
|
||||||
|
|
||||||
from ...test_processing_common import ProcessorTesterMixin
|
from ...test_processing_common import ProcessorTesterMixin
|
||||||
@@ -38,12 +45,13 @@ if is_torch_available():
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
from transformers import Qwen2VLImageProcessor
|
from transformers import Qwen2VLImageProcessorFast
|
||||||
|
|
||||||
|
|
||||||
@require_vision
|
@require_vision
|
||||||
@require_torch
|
@require_torch
|
||||||
@require_torchaudio
|
@require_torchaudio
|
||||||
|
@require_torchvision
|
||||||
class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||||
processor_class = Qwen2_5OmniProcessor
|
processor_class = Qwen2_5OmniProcessor
|
||||||
|
|
||||||
@@ -244,13 +252,13 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
processor.save_pretrained(self.tmpdirname)
|
processor.save_pretrained(self.tmpdirname)
|
||||||
processor = Qwen2_5OmniProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
processor = Qwen2_5OmniProcessor.from_pretrained(self.tmpdirname, use_fast=True)
|
||||||
|
|
||||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
||||||
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
|
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
||||||
self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
|
self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast)
|
||||||
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
|
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast)
|
||||||
self.assertIsInstance(processor.feature_extractor, WhisperFeatureExtractor)
|
self.assertIsInstance(processor.feature_extractor, WhisperFeatureExtractor)
|
||||||
|
|
||||||
def test_image_processor(self):
|
def test_image_processor(self):
|
||||||
@@ -267,8 +275,8 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|
||||||
input_image_proc = image_processor(image_input, return_tensors="np")
|
input_image_proc = image_processor(image_input, return_tensors="pt")
|
||||||
input_processor = processor(images=image_input, text="dummy", return_tensors="np")
|
input_processor = processor(images=image_input, text="dummy", return_tensors="pt")
|
||||||
|
|
||||||
for key in input_image_proc.keys():
|
for key in input_image_proc.keys():
|
||||||
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
|
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||||
|
|||||||
@@ -20,15 +20,15 @@ import unittest
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from transformers import AutoProcessor, Qwen2Tokenizer
|
from transformers import AutoProcessor, Qwen2TokenizerFast
|
||||||
from transformers.testing_utils import require_av, require_torch, require_vision
|
from transformers.testing_utils import require_av, require_torch, require_torchvision, require_vision
|
||||||
from transformers.utils import is_torch_available, is_vision_available
|
from transformers.utils import is_torch_available, is_vision_available
|
||||||
|
|
||||||
from ...test_processing_common import ProcessorTesterMixin
|
from ...test_processing_common import ProcessorTesterMixin
|
||||||
|
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
from transformers import Qwen2_5_VLProcessor, Qwen2VLImageProcessor
|
from transformers import Qwen2_5_VLProcessor, Qwen2VLImageProcessorFast
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
@@ -36,6 +36,7 @@ if is_torch_available():
|
|||||||
|
|
||||||
@require_vision
|
@require_vision
|
||||||
@require_torch
|
@require_torch
|
||||||
|
@require_torchvision
|
||||||
class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||||
processor_class = Qwen2_5_VLProcessor
|
processor_class = Qwen2_5_VLProcessor
|
||||||
|
|
||||||
@@ -73,12 +74,12 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
|
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
|
||||||
)
|
)
|
||||||
processor.save_pretrained(self.tmpdirname)
|
processor.save_pretrained(self.tmpdirname)
|
||||||
processor = Qwen2_5_VLProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
processor = Qwen2_5_VLProcessor.from_pretrained(self.tmpdirname, use_fast=True)
|
||||||
|
|
||||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
||||||
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
|
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
|
self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast)
|
||||||
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
|
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast)
|
||||||
|
|
||||||
def test_image_processor(self):
|
def test_image_processor(self):
|
||||||
image_processor = self.get_image_processor()
|
image_processor = self.get_image_processor()
|
||||||
@@ -91,8 +92,8 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|
||||||
input_image_proc = image_processor(image_input, return_tensors="np")
|
input_image_proc = image_processor(image_input, return_tensors="pt")
|
||||||
input_processor = processor(images=image_input, text="dummy", return_tensors="np")
|
input_processor = processor(images=image_input, text="dummy", return_tensors="pt")
|
||||||
|
|
||||||
for key in input_image_proc.keys():
|
for key in input_image_proc.keys():
|
||||||
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
|
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ import requests
|
|||||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||||
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
|
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
|
||||||
from transformers.testing_utils import require_torch, require_vision
|
from transformers.testing_utils import require_torch, require_vision
|
||||||
from transformers.utils import is_torch_available, is_vision_available
|
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||||
|
|
||||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs
|
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs
|
||||||
|
|
||||||
@@ -35,8 +35,8 @@ if is_vision_available():
|
|||||||
|
|
||||||
from transformers import Qwen2VLImageProcessor
|
from transformers import Qwen2VLImageProcessor
|
||||||
|
|
||||||
# if is_torchvision_available():
|
if is_torchvision_available():
|
||||||
# from transformers import Qwen2VLImageProcessorFast
|
from transformers import Qwen2VLImageProcessorFast
|
||||||
|
|
||||||
|
|
||||||
class Qwen2VLImageProcessingTester:
|
class Qwen2VLImageProcessingTester:
|
||||||
@@ -119,7 +119,7 @@ class Qwen2VLImageProcessingTester:
|
|||||||
@require_vision
|
@require_vision
|
||||||
class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||||
image_processing_class = Qwen2VLImageProcessor if is_vision_available() else None
|
image_processing_class = Qwen2VLImageProcessor if is_vision_available() else None
|
||||||
# fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None
|
fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super().setUp()
|
super().setUp()
|
||||||
@@ -363,3 +363,34 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
|
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
|
||||||
|
|
||||||
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
|
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
|
||||||
|
self.assertEqual(encoding_slow.image_grid_thw.dtype, encoding_fast.image_grid_thw.dtype)
|
||||||
|
self._assert_slow_fast_tensors_equivalence(
|
||||||
|
encoding_slow.image_grid_thw.float(), encoding_fast.image_grid_thw.float()
|
||||||
|
)
|
||||||
|
|
||||||
|
@require_vision
|
||||||
|
@require_torch
|
||||||
|
def test_slow_fast_equivalence_batched(self):
|
||||||
|
if not self.test_slow_image_processor or not self.test_fast_image_processor:
|
||||||
|
self.skipTest(reason="Skipping slow/fast equivalence test")
|
||||||
|
|
||||||
|
if self.image_processing_class is None or self.fast_image_processing_class is None:
|
||||||
|
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
|
||||||
|
|
||||||
|
if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
|
||||||
|
self.skipTest(
|
||||||
|
reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
|
||||||
|
)
|
||||||
|
|
||||||
|
dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
|
||||||
|
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
|
||||||
|
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
|
||||||
|
|
||||||
|
encoding_slow = image_processor_slow(dummy_images, return_tensors="pt")
|
||||||
|
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")
|
||||||
|
|
||||||
|
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
|
||||||
|
self.assertEqual(encoding_slow.image_grid_thw.dtype, encoding_fast.image_grid_thw.dtype)
|
||||||
|
self._assert_slow_fast_tensors_equivalence(
|
||||||
|
encoding_slow.image_grid_thw.float(), encoding_fast.image_grid_thw.float()
|
||||||
|
)
|
||||||
|
|||||||
@@ -20,18 +20,18 @@ import unittest
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from transformers import AutoProcessor, Qwen2Tokenizer
|
from transformers import AutoProcessor, Qwen2TokenizerFast
|
||||||
from transformers.testing_utils import require_av, require_torch, require_vision
|
from transformers.testing_utils import require_av, require_torch, require_torchvision, require_vision
|
||||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||||
|
|
||||||
from ...test_processing_common import ProcessorTesterMixin
|
from ...test_processing_common import ProcessorTesterMixin
|
||||||
|
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
from transformers import Qwen2VLImageProcessor, Qwen2VLProcessor
|
from transformers import Qwen2VLProcessor
|
||||||
|
|
||||||
if is_torchvision_available():
|
if is_torchvision_available():
|
||||||
from transformers import Qwen2VLVideoProcessor
|
from transformers import Qwen2VLImageProcessorFast, Qwen2VLVideoProcessor
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
@@ -39,6 +39,7 @@ if is_torch_available():
|
|||||||
|
|
||||||
@require_vision
|
@require_vision
|
||||||
@require_torch
|
@require_torch
|
||||||
|
@require_torchvision
|
||||||
class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||||
processor_class = Qwen2VLProcessor
|
processor_class = Qwen2VLProcessor
|
||||||
|
|
||||||
@@ -76,12 +77,12 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
|
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
|
||||||
)
|
)
|
||||||
processor.save_pretrained(self.tmpdirname)
|
processor.save_pretrained(self.tmpdirname)
|
||||||
processor = Qwen2VLProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
processor = Qwen2VLProcessor.from_pretrained(self.tmpdirname, use_fast=True)
|
||||||
|
|
||||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
||||||
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
|
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
|
self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast)
|
||||||
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
|
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast)
|
||||||
self.assertIsInstance(processor.video_processor, Qwen2VLVideoProcessor)
|
self.assertIsInstance(processor.video_processor, Qwen2VLVideoProcessor)
|
||||||
|
|
||||||
def test_image_processor(self):
|
def test_image_processor(self):
|
||||||
@@ -95,8 +96,8 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|
||||||
input_image_proc = image_processor(image_input, return_tensors="np")
|
input_image_proc = image_processor(image_input, return_tensors="pt")
|
||||||
input_processor = processor(images=image_input, text="dummy", return_tensors="np")
|
input_processor = processor(images=image_input, text="dummy", return_tensors="pt")
|
||||||
|
|
||||||
for key in input_image_proc.keys():
|
for key in input_image_proc.keys():
|
||||||
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
|
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||||
|
|||||||
@@ -937,7 +937,7 @@ class ProcessorTesterMixin:
|
|||||||
"video", batch_size, return_tensors, "videos_input_name", "video_processor", MODALITY_INPUT_DATA["videos"]
|
"video", batch_size, return_tensors, "videos_input_name", "video_processor", MODALITY_INPUT_DATA["videos"]
|
||||||
)
|
)
|
||||||
|
|
||||||
@parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
|
@parameterized.expand([(1, "pt"), (2, "pt")]) # fast image processors only support torch ("pt") tensors
|
||||||
def test_apply_chat_template_image(self, batch_size: int, return_tensors: str):
|
def test_apply_chat_template_image(self, batch_size: int, return_tensors: str):
|
||||||
self._test_apply_chat_template(
|
self._test_apply_chat_template(
|
||||||
"image", batch_size, return_tensors, "images_input_name", "image_processor", MODALITY_INPUT_DATA["images"]
|
"image", batch_size, return_tensors, "images_input_name", "image_processor", MODALITY_INPUT_DATA["images"]
|
||||||
|
|||||||
Reference in New Issue
Block a user